In [23]:
import gym
import numpy as np

In [24]:
# 创建环境
env = gym.make('CliffWalking-v0',render_mode='human')

In [25]:
from torch import nn
import torch

In [26]:
class PolicyNet(nn.Module):
    
    def __init__(self,input_dim,output_dim):
        super().__init__()
        self.linear1 = nn.Linear(input_dim,200)
        self.relu = nn.ReLU()
        self.linear2 = nn.Linear(200,output_dim)
        self.softmax = nn.Softmax(dim=-1)
    
    def forward(self,state):
        ### n
        x = self.linear1(state)
        x = self.relu(x)
        x = self.linear2(x) # n
        x = self.softmax(x) # n
        return x

In [27]:
class ValueNet(nn.Module):
    ### 用来学习值
    
    def __init__(self,input_dim):
        super().__init__()
        self.linear1 = nn.Linear(input_dim,200)
        self.relu = nn.ReLU()
        self.linear2 = nn.Linear(200,1)
    
    def forward(self,state):
        ### n
        x = self.linear1(state)
        x = self.relu(x)
        x = self.linear2(x) # n
        return x

In [28]:
from torch.distributions import Categorical
import numpy as np
np.bool8 = np.bool_

from torch.optim import AdamW

In [29]:
class Agent:
    
    def __init__(self):
        self.policy_net = PolicyNet(48,4)
        self.value_net = ValueNet(48)
        self.optimizer = AdamW(self.policy_net.parameters(),lr=1e-3)
        self.value_optimizer = AdamW(self.value_net.parameters(),lr=1e-3)
    
    def sample_action(self,state):
        probs = self.policy_net(state) # 4
        if np.random.uniform() < 0.5:
            action = np.random.randint(0,4)
            return action, torch.log(probs[action]+1e-8).detach()
        dist = Categorical(probs)
        action = dist.sample()
        log_prob = dist.log_prob(action).detach()
        return action.item(),log_prob
    
    def update(self,rewards,log_probs,xs,old_actions):
        ### 一次游戏时间
        ret = []
        adding = 0
        for r in rewards[::-1]:
            adding = adding * 0.99 + r
            ret.insert(0,adding)
        ret = torch.FloatTensor(ret)
        ret = ret - ret.mean()
        ret = ret / (ret.std()+1e-8)
        
        
        
        for _ in range(4):
            values = self.value_net(xs) ## B,48 ==> B,1
#             print(values.shape)

            new_probs = self.policy_net(xs) ## B,4
            dist = Categorical(new_probs) # B,4
#             actions = dist.sample() # B,1
            new_logprobs = dist.log_prob(old_actions.squeeze(-1)) # B,1
#             print('new log probs',new_logprobs.shape)
            advantages = ret - values.squeeze(-1).detach() # B
#             print('adv shape',advantages.shape,'ret shape',ret.shape,'value shape',values.shape,'log',log_probs.shape)
#             print('new_probs',new_probs.shape,'new_logprobs shape',new_logprobs.shape,'old_actions',old_actions.shape)
            ratio = torch.exp(new_logprobs - log_probs.squeeze(-1)) ## B
#             print('shape',ratio.shape)
            surr1 = ratio * advantages.squeeze(-1)
            surr2 = torch.clamp(ratio,0.8,1.2) * advantages
            loss = -torch.min(surr1,surr2).mean()


            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()


            value_loss = (values.squeeze(0) - ret).pow(2).mean()
            self.value_optimizer.zero_grad()
            value_loss.backward()
            self.value_optimizer.step()


    #         r_log_probs = []
    #         for r,log_prob in zip(ret,log_probs):
    #             r_log_probs.append(-r*log_prob)
    #         r_log_probs = torch.vstack(r_log_probs)

    #         loss = r_log_probs.sum()



            return loss

In [30]:
def convert2tensor(state):
    state_arr = np.zeros(48)
    state_arr[state] = 1
    state_arr = torch.FloatTensor(state_arr)
    return state_arr

In [31]:
def train(agent,env):
    success_count = []
    max_size = 1000
    for epoch in range(200000):
        rewards = []
        xs = []
        log_probs = []
        old_actions = []
        terminated = False
        success = True
        state,_ = env.reset()
        while not terminated and len(log_probs) < max_size:
            state_arr = convert2tensor(state)
            xs.append(state_arr)
            action, log_prob = agent.sample_action(state_arr)
            next_state, reward, terminated, truncated, _ = env.step(action)
            if reward == -100:
                reward = -10
                success = False
            if reward == -1 and state == 47:
                reward = 10
            state = next_state
            rewards.append(reward)
            log_probs.append(log_prob)
            old_actions.append(action)
        xs = torch.vstack(xs)
        log_probs = torch.vstack(log_probs)
        old_actions = torch.LongTensor(old_actions)
        loss = agent.update(rewards,log_probs,xs,old_actions) 
        success_count.append(success)
        
        
        if (epoch+1) % 10 == 0:
            print(f'success rate:  {len([s for s in success_count[-100:] if s]) / 100}')
            print(f'epoch: {epoch}, loss: {loss}, rewards: {sum(rewards)}, count: {len(rewards)}')

In [32]:
agent = Agent()



In [33]:
env = gym.make('CliffWalking-v0')
train(agent,env)

success rate:  0.0
epoch: 9, loss: -0.0005002827383577824, rewards: -1489, count: 913
success rate:  0.0
epoch: 19, loss: 0.0009156869491562247, rewards: -1684, count: 1000
success rate:  0.0
epoch: 29, loss: -0.00020552253408823162, rewards: -1468, count: 1000
success rate:  0.0
epoch: 39, loss: -0.0006219861679710448, rewards: -1522, count: 1000
success rate:  0.0
epoch: 49, loss: 0.00031052465783432126, rewards: -307, count: 253
success rate:  0.01
epoch: 59, loss: 0.0002517204266041517, rewards: -1405, count: 1000
success rate:  0.01
epoch: 69, loss: -0.00013774204126093537, rewards: -1216, count: 1000
success rate:  0.01
epoch: 79, loss: -0.00012465739564504474, rewards: -1117, count: 1000
success rate:  0.01
epoch: 89, loss: -1.9902738131349906e-05, rewards: -460, count: 343
success rate:  0.02
epoch: 99, loss: 1.6390324162784964e-05, rewards: -1153, count: 1000
success rate:  0.03
epoch: 109, loss: 0.00012695646728388965, rewards: -1099, count: 1000
success rate:  0.03
epoch: 11

success rate:  0.33
epoch: 969, loss: 0.40849626064300537, rewards: -198, count: 171
success rate:  0.33
epoch: 979, loss: 0.4631671905517578, rewards: -74, count: 74
success rate:  0.36
epoch: 989, loss: 0.3255077302455902, rewards: -465, count: 411
success rate:  0.39
epoch: 999, loss: 0.6999964714050293, rewards: -53, count: 53
success rate:  0.35
epoch: 1009, loss: 0.42863205075263977, rewards: -510, count: 465
success rate:  0.34
epoch: 1019, loss: 0.47540390491485596, rewards: -234, count: 234
success rate:  0.3
epoch: 1029, loss: 0.48522308468818665, rewards: -216, count: 207
success rate:  0.33
epoch: 1039, loss: 0.4274773895740509, rewards: -306, count: 297
success rate:  0.33
epoch: 1049, loss: 0.8529460430145264, rewards: -88, count: 88
success rate:  0.32
epoch: 1059, loss: 0.7436782717704773, rewards: -54, count: 54
success rate:  0.33
epoch: 1069, loss: 0.4645916223526001, rewards: -194, count: 185
success rate:  0.32
epoch: 1079, loss: 0.5357823967933655, rewards: -104, 

success rate:  0.39
epoch: 1939, loss: 1.0798730850219727, rewards: -273, count: 246
success rate:  0.41
epoch: 1949, loss: 1.5742977857589722, rewards: -62, count: 62
success rate:  0.4
epoch: 1959, loss: 0.5436072945594788, rewards: -294, count: 267
success rate:  0.41
epoch: 1969, loss: 1.1786221265792847, rewards: -406, count: 397
success rate:  0.38
epoch: 1979, loss: 1.048628330230713, rewards: -637, count: 619
success rate:  0.39
epoch: 1989, loss: 1.207743763923645, rewards: -204, count: 204
success rate:  0.42
epoch: 1999, loss: 1.1155527830123901, rewards: -393, count: 366
success rate:  0.38
epoch: 2009, loss: 0.739505410194397, rewards: -217, count: 199
success rate:  0.36
epoch: 2019, loss: 1.2092152833938599, rewards: -262, count: 226
success rate:  0.38
epoch: 2029, loss: 1.1085785627365112, rewards: -53, count: 44
success rate:  0.38
epoch: 2039, loss: 1.3647011518478394, rewards: -224, count: 206
success rate:  0.35
epoch: 2049, loss: 1.20438814163208, rewards: -162, c

success rate:  0.31
epoch: 2919, loss: 0.4944266378879547, rewards: -318, count: 282
success rate:  0.31
epoch: 2929, loss: 1.7381693124771118, rewards: -213, count: 168
success rate:  0.38
epoch: 2939, loss: 0.4490331709384918, rewards: -93, count: 93
success rate:  0.42
epoch: 2949, loss: 0.7737452983856201, rewards: -59, count: 59
success rate:  0.46
epoch: 2959, loss: 1.6295057535171509, rewards: -35, count: 35
success rate:  0.47
epoch: 2969, loss: 0.8736204504966736, rewards: -61, count: 52
success rate:  0.43
epoch: 2979, loss: 0.5858422517776489, rewards: -471, count: 426
success rate:  0.41
epoch: 2989, loss: 1.221670389175415, rewards: -219, count: 210
success rate:  0.41
epoch: 2999, loss: 0.6726319193840027, rewards: -461, count: 407
success rate:  0.37
epoch: 3009, loss: 1.34487783908844, rewards: -79, count: 79
success rate:  0.38
epoch: 3019, loss: 1.2180659770965576, rewards: -203, count: 194
success rate:  0.39
epoch: 3029, loss: 0.7393692135810852, rewards: -279, coun

success rate:  0.38
epoch: 3899, loss: 0.8594244122505188, rewards: -80, count: 80
success rate:  0.35
epoch: 3909, loss: 1.3144080638885498, rewards: -47, count: 47
success rate:  0.36
epoch: 3919, loss: 0.6799717545509338, rewards: -146, count: 137
success rate:  0.41
epoch: 3929, loss: 1.0925215482711792, rewards: -455, count: 410
success rate:  0.42
epoch: 3939, loss: 1.0549952983856201, rewards: -124, count: 115
success rate:  0.39
epoch: 3949, loss: 1.3488167524337769, rewards: -78, count: 78
success rate:  0.34
epoch: 3959, loss: 1.2383522987365723, rewards: -103, count: 94
success rate:  0.34
epoch: 3969, loss: 0.7669990658760071, rewards: -200, count: 191
success rate:  0.36
epoch: 3979, loss: 0.922884464263916, rewards: -483, count: 411
success rate:  0.35
epoch: 3989, loss: 1.402268886566162, rewards: -206, count: 170
success rate:  0.34
epoch: 3999, loss: 1.6786166429519653, rewards: -161, count: 152
success rate:  0.32
epoch: 4009, loss: 0.39825090765953064, rewards: -180,

success rate:  0.26
epoch: 4879, loss: 1.2297756671905518, rewards: -86, count: 86
success rate:  0.28
epoch: 4889, loss: 1.5961334705352783, rewards: -104, count: 95
success rate:  0.28
epoch: 4899, loss: 0.8251155614852905, rewards: -253, count: 235
success rate:  0.28
epoch: 4909, loss: 1.5234098434448242, rewards: -67, count: 67
success rate:  0.26
epoch: 4919, loss: 0.8011763095855713, rewards: -256, count: 238
success rate:  0.27
epoch: 4929, loss: 0.7463894486427307, rewards: -741, count: 642
success rate:  0.26
epoch: 4939, loss: 0.5592401027679443, rewards: -595, count: 559
success rate:  0.25
epoch: 4949, loss: 0.7800356149673462, rewards: -579, count: 543
success rate:  0.25
epoch: 4959, loss: 0.7572252750396729, rewards: -910, count: 811
success rate:  0.25
epoch: 4969, loss: 1.2232329845428467, rewards: -183, count: 165
success rate:  0.26
epoch: 4979, loss: 1.0375927686691284, rewards: -72, count: 72
success rate:  0.27
epoch: 4989, loss: 0.7965452075004578, rewards: -259

success rate:  0.3
epoch: 5869, loss: 0.8918043971061707, rewards: -179, count: 161
success rate:  0.31
epoch: 5879, loss: 0.5065383315086365, rewards: -840, count: 732
success rate:  0.32
epoch: 5889, loss: 1.2681057453155518, rewards: -226, count: 172
success rate:  0.35
epoch: 5899, loss: 0.792506992816925, rewards: -551, count: 488
success rate:  0.33
epoch: 5909, loss: 0.5054227113723755, rewards: -132, count: 123
success rate:  0.32
epoch: 5919, loss: 0.9589634537696838, rewards: -451, count: 433
success rate:  0.3
epoch: 5929, loss: 1.196694254875183, rewards: -148, count: 139
success rate:  0.3
epoch: 5939, loss: 1.289408802986145, rewards: -420, count: 384
success rate:  0.3
epoch: 5949, loss: 0.3749823570251465, rewards: -39, count: 39
success rate:  0.29
epoch: 5959, loss: 1.1177551746368408, rewards: -300, count: 291
success rate:  0.31
epoch: 5969, loss: 1.0192760229110718, rewards: -201, count: 192
success rate:  0.3
epoch: 5979, loss: 1.2561506032943726, rewards: -103, c

success rate:  0.36
epoch: 6859, loss: 0.6112667918205261, rewards: -189, count: 162
success rate:  0.36
epoch: 6869, loss: 1.945125937461853, rewards: -134, count: 125
success rate:  0.38
epoch: 6879, loss: 2.0276999473571777, rewards: -55, count: 37
success rate:  0.37
epoch: 6889, loss: 1.5342313051223755, rewards: -136, count: 136
success rate:  0.36
epoch: 6899, loss: 0.8896664977073669, rewards: -131, count: 104
success rate:  0.37
epoch: 6909, loss: 1.321981430053711, rewards: -185, count: 167
success rate:  0.37
epoch: 6919, loss: 1.446082592010498, rewards: -205, count: 196
success rate:  0.38
epoch: 6929, loss: 1.0149545669555664, rewards: -59, count: 59
success rate:  0.38
epoch: 6939, loss: 1.6636894941329956, rewards: -103, count: 103
success rate:  0.37
epoch: 6949, loss: 0.30966049432754517, rewards: -443, count: 407
success rate:  0.35
epoch: 6959, loss: 1.2651597261428833, rewards: -330, count: 294
success rate:  0.36
epoch: 6969, loss: 0.6291749477386475, rewards: -18

success rate:  0.33
epoch: 7839, loss: 0.9459988474845886, rewards: -228, count: 219
success rate:  0.35
epoch: 7849, loss: 0.7868765592575073, rewards: -175, count: 166
success rate:  0.37
epoch: 7859, loss: 1.429580807685852, rewards: -158, count: 149
success rate:  0.36
epoch: 7869, loss: 0.9785341620445251, rewards: -247, count: 229
success rate:  0.39
epoch: 7879, loss: 0.9340823888778687, rewards: -609, count: 546
success rate:  0.4
epoch: 7889, loss: 0.46942174434661865, rewards: -82, count: 82
success rate:  0.41
epoch: 7899, loss: 1.2409569025039673, rewards: -390, count: 381
success rate:  0.34
epoch: 7909, loss: 0.9065089821815491, rewards: -49, count: 49
success rate:  0.33
epoch: 7919, loss: 1.1913726329803467, rewards: -244, count: 208
success rate:  0.31
epoch: 7929, loss: 0.7477927803993225, rewards: -198, count: 189
success rate:  0.3
epoch: 7939, loss: 1.4537665843963623, rewards: -80, count: 80
success rate:  0.3
epoch: 7949, loss: 0.7098883390426636, rewards: -104, 

success rate:  0.3
epoch: 8819, loss: 1.2926603555679321, rewards: -182, count: 128
success rate:  0.31
epoch: 8829, loss: 1.2997944355010986, rewards: -100, count: 91
success rate:  0.29
epoch: 8839, loss: 0.9510076642036438, rewards: -253, count: 235
success rate:  0.27
epoch: 8849, loss: 1.03285813331604, rewards: -205, count: 178
success rate:  0.3
epoch: 8859, loss: 0.8896381855010986, rewards: -267, count: 231
success rate:  0.23
epoch: 8869, loss: 1.3260552883148193, rewards: -140, count: 131
success rate:  0.26
epoch: 8879, loss: 0.36525535583496094, rewards: -255, count: 219
success rate:  0.26
epoch: 8889, loss: 1.1609890460968018, rewards: -107, count: 98
success rate:  0.27
epoch: 8899, loss: 1.1966867446899414, rewards: -322, count: 304
success rate:  0.28
epoch: 8909, loss: 1.0891010761260986, rewards: -154, count: 145
success rate:  0.28
epoch: 8919, loss: 0.7392780780792236, rewards: -243, count: 207
success rate:  0.26
epoch: 8929, loss: 0.45330920815467834, rewards: -

success rate:  0.29
epoch: 9799, loss: 0.6247290968894958, rewards: -96, count: 96
success rate:  0.27
epoch: 9809, loss: 1.5791926383972168, rewards: -102, count: 93
success rate:  0.26
epoch: 9819, loss: 1.2176302671432495, rewards: -206, count: 179
success rate:  0.29
epoch: 9829, loss: 1.4193488359451294, rewards: -128, count: 128
success rate:  0.29
epoch: 9839, loss: 1.4772891998291016, rewards: -48, count: 48
success rate:  0.31
epoch: 9849, loss: 0.5857937335968018, rewards: -252, count: 225
success rate:  0.34
epoch: 9859, loss: 1.134000301361084, rewards: -491, count: 446
success rate:  0.33
epoch: 9869, loss: 0.702226996421814, rewards: -236, count: 209
success rate:  0.34
epoch: 9879, loss: 0.3592214584350586, rewards: -149, count: 140
success rate:  0.36
epoch: 9889, loss: 1.3291507959365845, rewards: -132, count: 123
success rate:  0.39
epoch: 9899, loss: 0.7459642291069031, rewards: -263, count: 236
success rate:  0.39
epoch: 9909, loss: 0.9576100707054138, rewards: -567

success rate:  0.37
epoch: 10779, loss: 1.0382261276245117, rewards: -370, count: 343
success rate:  0.33
epoch: 10789, loss: 1.0961909294128418, rewards: -259, count: 232
success rate:  0.36
epoch: 10799, loss: 1.312455654144287, rewards: -373, count: 337
success rate:  0.35
epoch: 10809, loss: 0.6989484429359436, rewards: -807, count: 744
success rate:  0.32
epoch: 10819, loss: 1.1130359172821045, rewards: -88, count: 79
success rate:  0.28
epoch: 10829, loss: 0.7737442851066589, rewards: -135, count: 126
success rate:  0.24
epoch: 10839, loss: 0.6981708407402039, rewards: -71, count: 71
success rate:  0.25
epoch: 10849, loss: 0.5310783386230469, rewards: -323, count: 314
success rate:  0.26
epoch: 10859, loss: 0.31060636043548584, rewards: -118, count: 118
success rate:  0.24
epoch: 10869, loss: 1.7956832647323608, rewards: -133, count: 115
success rate:  0.25
epoch: 10879, loss: 0.3344564735889435, rewards: -196, count: 169
success rate:  0.31
epoch: 10889, loss: 1.5108329057693481

success rate:  0.29
epoch: 11759, loss: 0.9627788662910461, rewards: -438, count: 366
success rate:  0.29
epoch: 11769, loss: 1.4396032094955444, rewards: -98, count: 98
success rate:  0.31
epoch: 11779, loss: 1.186587929725647, rewards: -169, count: 160
success rate:  0.31
epoch: 11789, loss: 0.8446720838546753, rewards: -190, count: 172
success rate:  0.33
epoch: 11799, loss: 0.6087685823440552, rewards: -371, count: 335
success rate:  0.36
epoch: 11809, loss: 1.1167829036712646, rewards: -141, count: 123
success rate:  0.38
epoch: 11819, loss: 0.520411491394043, rewards: -183, count: 165
success rate:  0.37
epoch: 11829, loss: 1.0535353422164917, rewards: -216, count: 198
success rate:  0.38
epoch: 11839, loss: 1.5697084665298462, rewards: -109, count: 91
success rate:  0.37
epoch: 11849, loss: 0.8635168075561523, rewards: -266, count: 239
success rate:  0.39
epoch: 11859, loss: 0.7977361679077148, rewards: -74, count: 65
success rate:  0.38
epoch: 11869, loss: 1.1997822523117065, r

success rate:  0.29
epoch: 12729, loss: 1.2171730995178223, rewards: -122, count: 113
success rate:  0.29
epoch: 12739, loss: 0.3554132878780365, rewards: -510, count: 429
success rate:  0.26
epoch: 12749, loss: 0.5841261148452759, rewards: -163, count: 154
success rate:  0.21
epoch: 12759, loss: 1.4966957569122314, rewards: -176, count: 167
success rate:  0.21
epoch: 12769, loss: 1.2043660879135132, rewards: -344, count: 317
success rate:  0.2
epoch: 12779, loss: 1.5739216804504395, rewards: -124, count: 124
success rate:  0.22
epoch: 12789, loss: 1.1691288948059082, rewards: -85, count: 67
success rate:  0.23
epoch: 12799, loss: 0.5513585805892944, rewards: -45, count: 36
success rate:  0.26
epoch: 12809, loss: 1.9445158243179321, rewards: -90, count: 90
success rate:  0.27
epoch: 12819, loss: 1.5392601490020752, rewards: -113, count: 113
success rate:  0.3
epoch: 12829, loss: 1.063560128211975, rewards: -114, count: 114
success rate:  0.29
epoch: 12839, loss: 0.8338997960090637, rew

success rate:  0.34
epoch: 13689, loss: 1.1844122409820557, rewards: -148, count: 130
success rate:  0.34
epoch: 13699, loss: 0.2756302058696747, rewards: -240, count: 195
success rate:  0.39
epoch: 13709, loss: 1.0862433910369873, rewards: -177, count: 177
success rate:  0.38
epoch: 13719, loss: 1.2407833337783813, rewards: -437, count: 383
success rate:  0.35
epoch: 13729, loss: 0.6710066199302673, rewards: -207, count: 189
success rate:  0.39
epoch: 13739, loss: 1.4662355184555054, rewards: -429, count: 384
success rate:  0.38
epoch: 13749, loss: 0.7274215817451477, rewards: -389, count: 326
success rate:  0.37
epoch: 13759, loss: 1.2377824783325195, rewards: -277, count: 232
success rate:  0.34
epoch: 13769, loss: 0.5681973695755005, rewards: -489, count: 399
success rate:  0.32
epoch: 13779, loss: 0.34157857298851013, rewards: -53, count: 53
success rate:  0.33
epoch: 13789, loss: 0.5998977422714233, rewards: -316, count: 289
success rate:  0.31
epoch: 13799, loss: 1.2481510639190

success rate:  0.36
epoch: 14649, loss: 1.090487003326416, rewards: -123, count: 105
success rate:  0.37
epoch: 14659, loss: 0.5935896635055542, rewards: -683, count: 620
success rate:  0.37
epoch: 14669, loss: 0.8797032237052917, rewards: -117, count: 117
success rate:  0.36
epoch: 14679, loss: 0.9727287292480469, rewards: -80, count: 80
success rate:  0.37
epoch: 14689, loss: 1.882175087928772, rewards: -141, count: 123
success rate:  0.38
epoch: 14699, loss: 0.8632408380508423, rewards: -232, count: 196
success rate:  0.38
epoch: 14709, loss: 1.3714479207992554, rewards: -159, count: 159
success rate:  0.39
epoch: 14719, loss: 1.2904746532440186, rewards: -55, count: 55
success rate:  0.37
epoch: 14729, loss: 0.6559781432151794, rewards: -905, count: 842
success rate:  0.37
epoch: 14739, loss: 1.4447442293167114, rewards: -315, count: 288
success rate:  0.37
epoch: 14749, loss: 1.0879802703857422, rewards: -99, count: 99
success rate:  0.37
epoch: 14759, loss: 1.568917989730835, rew

success rate:  0.33
epoch: 15619, loss: 1.1102242469787598, rewards: -70, count: 70
success rate:  0.33
epoch: 15629, loss: 0.9838852882385254, rewards: -36, count: 27
success rate:  0.33
epoch: 15639, loss: 1.1019383668899536, rewards: -132, count: 132
success rate:  0.29
epoch: 15649, loss: 1.629022240638733, rewards: -179, count: 143
success rate:  0.31
epoch: 15659, loss: 1.1639063358306885, rewards: -112, count: 112
success rate:  0.32
epoch: 15669, loss: 1.252795696258545, rewards: -182, count: 146
success rate:  0.39
epoch: 15679, loss: 1.3863074779510498, rewards: -263, count: 245
success rate:  0.38
epoch: 15689, loss: 0.9009661078453064, rewards: -71, count: 71
success rate:  0.37
epoch: 15699, loss: 0.4832119941711426, rewards: -40, count: 40
success rate:  0.36
epoch: 15709, loss: 1.199448823928833, rewards: -97, count: 97
success rate:  0.34
epoch: 15719, loss: 1.5236870050430298, rewards: -141, count: 132
success rate:  0.35
epoch: 15729, loss: 0.7463043928146362, rewards

success rate:  0.35
epoch: 16599, loss: 1.6021058559417725, rewards: -222, count: 213
success rate:  0.36
epoch: 16609, loss: 0.9002735614776611, rewards: -578, count: 497
success rate:  0.36
epoch: 16619, loss: 1.0009301900863647, rewards: -400, count: 346
success rate:  0.36
epoch: 16629, loss: 0.5965694785118103, rewards: -70, count: 70
success rate:  0.36
epoch: 16639, loss: 0.9175549149513245, rewards: -91, count: 82
success rate:  0.34
epoch: 16649, loss: 0.9665467739105225, rewards: -106, count: 97
success rate:  0.35
epoch: 16659, loss: 1.4776396751403809, rewards: -105, count: 105
success rate:  0.36
epoch: 16669, loss: 0.88216233253479, rewards: -107, count: 107
success rate:  0.35
epoch: 16679, loss: 0.6892664432525635, rewards: -351, count: 297
success rate:  0.33
epoch: 16689, loss: 1.0000280141830444, rewards: -122, count: 113
success rate:  0.35
epoch: 16699, loss: 0.9308764934539795, rewards: -834, count: 762
success rate:  0.38
epoch: 16709, loss: 1.2837421894073486, r

success rate:  0.24
epoch: 17579, loss: 0.8208466172218323, rewards: -280, count: 253
success rate:  0.27
epoch: 17589, loss: 1.6714351177215576, rewards: -107, count: 98
success rate:  0.3
epoch: 17599, loss: 0.9867372512817383, rewards: -226, count: 217
success rate:  0.27
epoch: 17609, loss: 0.8623844385147095, rewards: -180, count: 171
success rate:  0.28
epoch: 17619, loss: 0.9402456283569336, rewards: -348, count: 312
success rate:  0.3
epoch: 17629, loss: 0.8897155523300171, rewards: -56, count: 56
success rate:  0.29
epoch: 17639, loss: 1.387739896774292, rewards: -186, count: 159
success rate:  0.31
epoch: 17649, loss: 0.828249454498291, rewards: -202, count: 193
success rate:  0.34
epoch: 17659, loss: 0.6320347785949707, rewards: -273, count: 255
success rate:  0.32
epoch: 17669, loss: 0.9640348553657532, rewards: -441, count: 396
success rate:  0.33
epoch: 17679, loss: 1.1724809408187866, rewards: -108, count: 108
success rate:  0.31
epoch: 17689, loss: 0.737533450126648, re

success rate:  0.45
epoch: 18559, loss: 0.8426629900932312, rewards: -68, count: 59
success rate:  0.44
epoch: 18569, loss: 1.4312596321105957, rewards: -143, count: 143
success rate:  0.45
epoch: 18579, loss: 1.074349045753479, rewards: -261, count: 216
success rate:  0.43
epoch: 18589, loss: 1.5317163467407227, rewards: -107, count: 107
success rate:  0.44
epoch: 18599, loss: 1.7340006828308105, rewards: -97, count: 88
success rate:  0.41
epoch: 18609, loss: 1.3183574676513672, rewards: -93, count: 84
success rate:  0.45
epoch: 18619, loss: 1.0682376623153687, rewards: -184, count: 175
success rate:  0.42
epoch: 18629, loss: 1.3269575834274292, rewards: -106, count: 97
success rate:  0.4
epoch: 18639, loss: 1.4313024282455444, rewards: -182, count: 173
success rate:  0.38
epoch: 18649, loss: 1.222622036933899, rewards: -110, count: 92
success rate:  0.37
epoch: 18659, loss: 2.070977210998535, rewards: -100, count: 100
success rate:  0.35
epoch: 18669, loss: 0.40863141417503357, rewar

success rate:  0.37
epoch: 19529, loss: 1.3177624940872192, rewards: -180, count: 180
success rate:  0.37
epoch: 19539, loss: 1.259361982345581, rewards: -190, count: 181
success rate:  0.39
epoch: 19549, loss: 0.6485080718994141, rewards: -72, count: 72
success rate:  0.38
epoch: 19559, loss: 1.3882150650024414, rewards: -107, count: 107
success rate:  0.38
epoch: 19569, loss: 1.5109916925430298, rewards: -352, count: 343
success rate:  0.37
epoch: 19579, loss: 0.613266110420227, rewards: -103, count: 94
success rate:  0.36
epoch: 19589, loss: 0.7358388304710388, rewards: -57, count: 57
success rate:  0.35
epoch: 19599, loss: 1.2647912502288818, rewards: -69, count: 69
success rate:  0.34
epoch: 19609, loss: 0.0428193062543869, rewards: -122, count: 113
success rate:  0.34
epoch: 19619, loss: 0.7895804643630981, rewards: -121, count: 112
success rate:  0.35
epoch: 19629, loss: 1.3547627925872803, rewards: -139, count: 121
success rate:  0.39
epoch: 19639, loss: 1.0289360284805298, rew

success rate:  0.3
epoch: 20509, loss: 1.3122540712356567, rewards: -141, count: 123
success rate:  0.31
epoch: 20519, loss: 0.9460748434066772, rewards: -61, count: 61
success rate:  0.32
epoch: 20529, loss: 1.5291805267333984, rewards: -137, count: 137
success rate:  0.33
epoch: 20539, loss: 0.30815064907073975, rewards: -47, count: 47
success rate:  0.33
epoch: 20549, loss: 1.4712891578674316, rewards: -109, count: 109
success rate:  0.33
epoch: 20559, loss: 1.0179768800735474, rewards: -160, count: 151
success rate:  0.38
epoch: 20569, loss: 1.768707036972046, rewards: -85, count: 85
success rate:  0.38
epoch: 20579, loss: 1.103069543838501, rewards: -466, count: 403
success rate:  0.37
epoch: 20589, loss: 0.7458984851837158, rewards: -640, count: 568
success rate:  0.37
epoch: 20599, loss: 0.6343669891357422, rewards: -579, count: 507
success rate:  0.4
epoch: 20609, loss: 0.408154159784317, rewards: -158, count: 149
success rate:  0.4
epoch: 20619, loss: 0.8999629020690918, rewar

success rate:  0.37
epoch: 21479, loss: 0.9302395582199097, rewards: -43, count: 43
success rate:  0.37
epoch: 21489, loss: 1.5874615907669067, rewards: -149, count: 149
success rate:  0.38
epoch: 21499, loss: 1.2251501083374023, rewards: -153, count: 153
success rate:  0.38
epoch: 21509, loss: 0.8905234932899475, rewards: -48, count: 39
success rate:  0.36
epoch: 21519, loss: 0.38913750648498535, rewards: -357, count: 339
success rate:  0.37
epoch: 21529, loss: 1.1293346881866455, rewards: -65, count: 65
success rate:  0.36
epoch: 21539, loss: 0.9532374143600464, rewards: -296, count: 278
success rate:  0.36
epoch: 21549, loss: 1.1852540969848633, rewards: -181, count: 163
success rate:  0.35
epoch: 21559, loss: 1.6087714433670044, rewards: -116, count: 98
success rate:  0.34
epoch: 21569, loss: 1.586419939994812, rewards: -158, count: 140
success rate:  0.29
epoch: 21579, loss: 1.401046872138977, rewards: -119, count: 119
success rate:  0.28
epoch: 21589, loss: 0.8724275827407837, re

success rate:  0.35
epoch: 22459, loss: 0.3816489577293396, rewards: -61, count: 61
success rate:  0.34
epoch: 22469, loss: 0.4073125123977661, rewards: -121, count: 121
success rate:  0.36
epoch: 22479, loss: 0.2521762549877167, rewards: -149, count: 140
success rate:  0.39
epoch: 22489, loss: 1.618857979774475, rewards: -111, count: 93
success rate:  0.38
epoch: 22499, loss: 1.7969844341278076, rewards: -34, count: 34
success rate:  0.4
epoch: 22509, loss: 1.3984485864639282, rewards: -163, count: 145
success rate:  0.4
epoch: 22519, loss: 0.9655923247337341, rewards: -232, count: 205
success rate:  0.38
epoch: 22529, loss: 0.5557373762130737, rewards: -67, count: 67
success rate:  0.37
epoch: 22539, loss: 1.039760947227478, rewards: -323, count: 260
success rate:  0.35
epoch: 22549, loss: 0.8263095021247864, rewards: -543, count: 498
success rate:  0.37
epoch: 22559, loss: 1.1739436388015747, rewards: -145, count: 145
success rate:  0.38
epoch: 22569, loss: 1.0237115621566772, rewar

success rate:  0.45
epoch: 23429, loss: 0.5914166569709778, rewards: -578, count: 524
success rate:  0.45
epoch: 23439, loss: 0.6757221817970276, rewards: -268, count: 250
success rate:  0.43
epoch: 23449, loss: 0.5653713345527649, rewards: -161, count: 134
success rate:  0.43
epoch: 23459, loss: 0.8428164124488831, rewards: -296, count: 251
success rate:  0.4
epoch: 23469, loss: 0.8987991809844971, rewards: -154, count: 145
success rate:  0.37
epoch: 23479, loss: 0.8233839869499207, rewards: -80, count: 80
success rate:  0.34
epoch: 23489, loss: 1.1225565671920776, rewards: -531, count: 504
success rate:  0.33
epoch: 23499, loss: 0.41495198011398315, rewards: -73, count: 55
success rate:  0.35
epoch: 23509, loss: 0.7761713862419128, rewards: -145, count: 136
success rate:  0.34
epoch: 23519, loss: 1.440521478652954, rewards: -94, count: 85
success rate:  0.32
epoch: 23529, loss: 1.6052383184432983, rewards: -205, count: 169
success rate:  0.33
epoch: 23539, loss: 1.4785012006759644, r

success rate:  0.28
epoch: 24399, loss: 0.9995500445365906, rewards: -156, count: 147
success rate:  0.26
epoch: 24409, loss: 1.069340467453003, rewards: -203, count: 194
success rate:  0.27
epoch: 24419, loss: 1.2863231897354126, rewards: -63, count: 63
success rate:  0.3
epoch: 24429, loss: 1.298075795173645, rewards: -83, count: 83
success rate:  0.27
epoch: 24439, loss: 0.7402884364128113, rewards: -290, count: 263
success rate:  0.27
epoch: 24449, loss: 0.9867516756057739, rewards: -121, count: 121
success rate:  0.27
epoch: 24459, loss: 1.2670665979385376, rewards: -362, count: 317
success rate:  0.26
epoch: 24469, loss: 1.509979009628296, rewards: -77, count: 77
success rate:  0.28
epoch: 24479, loss: 1.2130131721496582, rewards: -312, count: 285
success rate:  0.29
epoch: 24489, loss: 1.320859432220459, rewards: -153, count: 144
success rate:  0.32
epoch: 24499, loss: 1.2960801124572754, rewards: -272, count: 236
success rate:  0.38
epoch: 24509, loss: 0.9873666763305664, rewar

success rate:  0.37
epoch: 25369, loss: 0.8852307200431824, rewards: -519, count: 474
success rate:  0.35
epoch: 25379, loss: 0.8548710942268372, rewards: -90, count: 90
success rate:  0.32
epoch: 25389, loss: 0.5027803182601929, rewards: -422, count: 395
success rate:  0.32
epoch: 25399, loss: 0.3777448832988739, rewards: -246, count: 237
success rate:  0.31
epoch: 25409, loss: 1.7145097255706787, rewards: -103, count: 94
success rate:  0.32
epoch: 25419, loss: 0.7310537695884705, rewards: -55, count: 55
success rate:  0.3
epoch: 25429, loss: 0.3334691822528839, rewards: -139, count: 121
success rate:  0.31
epoch: 25439, loss: 1.1109991073608398, rewards: -76, count: 76
success rate:  0.3
epoch: 25449, loss: 0.9999778270721436, rewards: -293, count: 230
success rate:  0.31
epoch: 25459, loss: 1.232162356376648, rewards: -109, count: 109
success rate:  0.32
epoch: 25469, loss: 1.1099672317504883, rewards: -397, count: 361
success rate:  0.33
epoch: 25479, loss: 0.8703339099884033, rewa

success rate:  0.33
epoch: 26349, loss: 1.222043514251709, rewards: -115, count: 115
success rate:  0.35
epoch: 26359, loss: 1.3508214950561523, rewards: -31, count: 31
success rate:  0.34
epoch: 26369, loss: 1.6501832008361816, rewards: -43, count: 43
success rate:  0.37
epoch: 26379, loss: 1.0869711637496948, rewards: -313, count: 304
success rate:  0.37
epoch: 26389, loss: 0.9674827456474304, rewards: -66, count: 66
success rate:  0.36
epoch: 26399, loss: 0.9662282466888428, rewards: -102, count: 102
success rate:  0.37
epoch: 26409, loss: 1.0810658931732178, rewards: -80, count: 80
success rate:  0.35
epoch: 26419, loss: 0.7604392766952515, rewards: -126, count: 108
success rate:  0.36
epoch: 26429, loss: 2.4073574542999268, rewards: -68, count: 59
success rate:  0.37
epoch: 26439, loss: 1.2750383615493774, rewards: -159, count: 150
success rate:  0.36
epoch: 26449, loss: 2.0974738597869873, rewards: -123, count: 123
success rate:  0.39
epoch: 26459, loss: 1.1234370470046997, rewar

success rate:  0.35
epoch: 27329, loss: 1.4474751949310303, rewards: -141, count: 123
success rate:  0.34
epoch: 27339, loss: 0.8216244578361511, rewards: -366, count: 330
success rate:  0.32
epoch: 27349, loss: 1.140742301940918, rewards: -222, count: 186
success rate:  0.28
epoch: 27359, loss: 1.0980849266052246, rewards: -431, count: 368
success rate:  0.28
epoch: 27369, loss: 0.768951952457428, rewards: -59, count: 59
success rate:  0.31
epoch: 27379, loss: 0.9471226334571838, rewards: -251, count: 224
success rate:  0.35
epoch: 27389, loss: 0.6472654342651367, rewards: -33, count: 33
success rate:  0.36
epoch: 27399, loss: 0.5185970067977905, rewards: -114, count: 105
success rate:  0.4
epoch: 27409, loss: 0.34274977445602417, rewards: -26, count: 26
success rate:  0.35
epoch: 27419, loss: 0.8466311097145081, rewards: -342, count: 324
success rate:  0.37
epoch: 27429, loss: 1.2952368259429932, rewards: -200, count: 191
success rate:  0.34
epoch: 27439, loss: 0.55427086353302, rewa

success rate:  0.42
epoch: 28299, loss: 1.0507978200912476, rewards: -91, count: 82
success rate:  0.37
epoch: 28309, loss: 0.8680177330970764, rewards: -141, count: 132
success rate:  0.34
epoch: 28319, loss: 0.9509289264678955, rewards: -520, count: 484
success rate:  0.32
epoch: 28329, loss: 1.0912405252456665, rewards: -221, count: 212
success rate:  0.3
epoch: 28339, loss: 1.1865943670272827, rewards: -71, count: 71
success rate:  0.29
epoch: 28349, loss: 1.5708171129226685, rewards: -106, count: 97
success rate:  0.26
epoch: 28359, loss: 1.0973732471466064, rewards: -580, count: 517
success rate:  0.25
epoch: 28369, loss: 1.3121904134750366, rewards: -113, count: 113
success rate:  0.26
epoch: 28379, loss: 0.5644327998161316, rewards: -368, count: 332
success rate:  0.25
epoch: 28389, loss: 0.7634513974189758, rewards: -124, count: 97
success rate:  0.26
epoch: 28399, loss: 1.0361452102661133, rewards: -235, count: 190
success rate:  0.28
epoch: 28409, loss: 0.6665266752243042, r

success rate:  0.36
epoch: 29259, loss: 0.605168342590332, rewards: -141, count: 132
success rate:  0.35
epoch: 29269, loss: 1.245887279510498, rewards: -82, count: 73
success rate:  0.39
epoch: 29279, loss: 0.7624385356903076, rewards: -189, count: 180
success rate:  0.41
epoch: 29289, loss: 1.0591591596603394, rewards: -66, count: 66
success rate:  0.42
epoch: 29299, loss: 0.9243660569190979, rewards: -94, count: 94
success rate:  0.42
epoch: 29309, loss: 1.0588409900665283, rewards: -88, count: 79
success rate:  0.39
epoch: 29319, loss: 1.489113211631775, rewards: -116, count: 107
success rate:  0.38
epoch: 29329, loss: 0.8645291924476624, rewards: -429, count: 393
success rate:  0.41
epoch: 29339, loss: 0.736434280872345, rewards: -252, count: 234
success rate:  0.43
epoch: 29349, loss: 1.2782986164093018, rewards: -385, count: 358
success rate:  0.41
epoch: 29359, loss: 0.4418007731437683, rewards: -92, count: 83
success rate:  0.45
epoch: 29369, loss: 0.6346950531005859, rewards:

success rate:  0.23
epoch: 30229, loss: 0.44561418890953064, rewards: -136, count: 127
success rate:  0.22
epoch: 30239, loss: 1.4844805002212524, rewards: -40, count: 40
success rate:  0.23
epoch: 30249, loss: 1.095108151435852, rewards: -228, count: 210
success rate:  0.22
epoch: 30259, loss: 1.2843949794769287, rewards: -128, count: 119
success rate:  0.21
epoch: 30269, loss: 1.1563844680786133, rewards: -421, count: 403
success rate:  0.21
epoch: 30279, loss: 1.586374044418335, rewards: -84, count: 84
success rate:  0.18
epoch: 30289, loss: 0.879182755947113, rewards: -457, count: 430
success rate:  0.19
epoch: 30299, loss: 1.3555147647857666, rewards: -167, count: 158
success rate:  0.2
epoch: 30309, loss: 1.368736743927002, rewards: -104, count: 104
success rate:  0.22
epoch: 30319, loss: 0.8557065725326538, rewards: -35, count: 35
success rate:  0.26
epoch: 30329, loss: 1.2276536226272583, rewards: -96, count: 96
success rate:  0.28
epoch: 30339, loss: 1.1612695455551147, reward

success rate:  0.31
epoch: 31209, loss: 0.6214567422866821, rewards: -89, count: 89
success rate:  0.36
epoch: 31219, loss: 1.010432481765747, rewards: -112, count: 112
success rate:  0.34
epoch: 31229, loss: 1.6676145792007446, rewards: -181, count: 172
success rate:  0.34
epoch: 31239, loss: 1.8600596189498901, rewards: -132, count: 132
success rate:  0.34
epoch: 31249, loss: 1.1055649518966675, rewards: -482, count: 428
success rate:  0.34
epoch: 31259, loss: 1.1692099571228027, rewards: -357, count: 339
success rate:  0.34
epoch: 31269, loss: 1.036702036857605, rewards: -209, count: 191
success rate:  0.34
epoch: 31279, loss: 0.8974307775497437, rewards: -684, count: 621
success rate:  0.32
epoch: 31289, loss: 1.40592360496521, rewards: -347, count: 311
success rate:  0.32
epoch: 31299, loss: 1.2798644304275513, rewards: -317, count: 218
success rate:  0.32
epoch: 31309, loss: 1.3023408651351929, rewards: -193, count: 175
success rate:  0.28
epoch: 31319, loss: 1.8142420053482056, 

success rate:  0.28
epoch: 32179, loss: 0.9387195706367493, rewards: -29, count: 29
success rate:  0.28
epoch: 32189, loss: 1.6857149600982666, rewards: -174, count: 156
success rate:  0.24
epoch: 32199, loss: 1.8556379079818726, rewards: -91, count: 82
success rate:  0.22
epoch: 32209, loss: 1.519602656364441, rewards: -221, count: 203
success rate:  0.23
epoch: 32219, loss: 1.852798581123352, rewards: -130, count: 112
success rate:  0.28
epoch: 32229, loss: 1.2009989023208618, rewards: -232, count: 205
success rate:  0.31
epoch: 32239, loss: 0.693520188331604, rewards: -438, count: 402
success rate:  0.3
epoch: 32249, loss: 1.3207623958587646, rewards: -321, count: 294
success rate:  0.31
epoch: 32259, loss: 1.2701936960220337, rewards: -113, count: 95
success rate:  0.31
epoch: 32269, loss: 1.52964448928833, rewards: -68, count: 68
success rate:  0.29
epoch: 32279, loss: 0.6968517303466797, rewards: -742, count: 706
success rate:  0.31
epoch: 32289, loss: 1.086437702178955, rewards:

success rate:  0.45
epoch: 33159, loss: 0.9121255874633789, rewards: -66, count: 57
success rate:  0.42
epoch: 33169, loss: 0.6673782467842102, rewards: -90, count: 81
success rate:  0.38
epoch: 33179, loss: 1.2667351961135864, rewards: -173, count: 155
success rate:  0.4
epoch: 33189, loss: 1.7289588451385498, rewards: -84, count: 84
success rate:  0.4
epoch: 33199, loss: 1.511749267578125, rewards: -112, count: 112
success rate:  0.38
epoch: 33209, loss: 0.8945802450180054, rewards: -558, count: 531
success rate:  0.37
epoch: 33219, loss: 1.2136491537094116, rewards: -157, count: 148
success rate:  0.35
epoch: 33229, loss: 1.4446381330490112, rewards: -103, count: 103
success rate:  0.35
epoch: 33239, loss: 0.7345085740089417, rewards: -97, count: 88
success rate:  0.31
epoch: 33249, loss: 1.1690953969955444, rewards: -127, count: 118
success rate:  0.31
epoch: 33259, loss: 0.6215143203735352, rewards: -302, count: 266
success rate:  0.34
epoch: 33269, loss: 1.034816861152649, reward

success rate:  0.46
epoch: 34129, loss: 1.5872547626495361, rewards: -91, count: 82
success rate:  0.46
epoch: 34139, loss: 1.0742781162261963, rewards: -278, count: 269
success rate:  0.42
epoch: 34149, loss: 1.0996173620224, rewards: -269, count: 242
success rate:  0.37
epoch: 34159, loss: 0.6625987887382507, rewards: -530, count: 476
success rate:  0.39
epoch: 34169, loss: 1.0448765754699707, rewards: -315, count: 279
success rate:  0.39
epoch: 34179, loss: 0.883847713470459, rewards: -218, count: 209
success rate:  0.37
epoch: 34189, loss: 1.000239372253418, rewards: -91, count: 91
success rate:  0.3
epoch: 34199, loss: 0.58319091796875, rewards: -304, count: 277
success rate:  0.27
epoch: 34209, loss: 1.0091482400894165, rewards: -287, count: 251
success rate:  0.25
epoch: 34219, loss: 0.5927916169166565, rewards: -255, count: 237
success rate:  0.3
epoch: 34229, loss: 0.32649511098861694, rewards: -190, count: 163
success rate:  0.26
epoch: 34239, loss: 1.1735678911209106, reward

success rate:  0.32
epoch: 35099, loss: 1.0714092254638672, rewards: -176, count: 140
success rate:  0.3
epoch: 35109, loss: 1.0919389724731445, rewards: -67, count: 67
success rate:  0.3
epoch: 35119, loss: 1.0129808187484741, rewards: -70, count: 70
success rate:  0.29
epoch: 35129, loss: 0.20684286952018738, rewards: -64, count: 64
success rate:  0.31
epoch: 35139, loss: 0.7174884676933289, rewards: -135, count: 126
success rate:  0.32
epoch: 35149, loss: 1.2721014022827148, rewards: -79, count: 79
success rate:  0.34
epoch: 35159, loss: 0.5389075875282288, rewards: -297, count: 261
success rate:  0.36
epoch: 35169, loss: 1.3707275390625, rewards: -166, count: 121
success rate:  0.35
epoch: 35179, loss: 0.7827524542808533, rewards: -334, count: 307
success rate:  0.33
epoch: 35189, loss: 0.8761369585990906, rewards: -167, count: 149
success rate:  0.34
epoch: 35199, loss: 0.8813893795013428, rewards: -276, count: 249
success rate:  0.35
epoch: 35209, loss: 0.9576492309570312, reward

success rate:  0.29
epoch: 36069, loss: 0.6013156175613403, rewards: -283, count: 247
success rate:  0.3
epoch: 36079, loss: 0.7520632147789001, rewards: -590, count: 500
success rate:  0.32
epoch: 36089, loss: 0.4043641686439514, rewards: -64, count: 64
success rate:  0.31
epoch: 36099, loss: 0.6611332297325134, rewards: -139, count: 130
success rate:  0.31
epoch: 36109, loss: 0.7844675183296204, rewards: -453, count: 408
success rate:  0.32
epoch: 36119, loss: 1.7116472721099854, rewards: -115, count: 97
success rate:  0.33
epoch: 36129, loss: 0.6688560843467712, rewards: -248, count: 230
success rate:  0.32
epoch: 36139, loss: 0.30205950140953064, rewards: -368, count: 341
success rate:  0.32
epoch: 36149, loss: 0.8031129240989685, rewards: -296, count: 269
success rate:  0.27
epoch: 36159, loss: 0.4926988184452057, rewards: -340, count: 304
success rate:  0.27
epoch: 36169, loss: 0.7400508522987366, rewards: -405, count: 360
success rate:  0.25
epoch: 36179, loss: 0.874368906021118

success rate:  0.32
epoch: 37029, loss: 0.7086234092712402, rewards: -73, count: 64
success rate:  0.33
epoch: 37039, loss: 1.1532788276672363, rewards: -77, count: 68
success rate:  0.34
epoch: 37049, loss: 0.644592821598053, rewards: -372, count: 354
success rate:  0.35
epoch: 37059, loss: 1.4104653596878052, rewards: -182, count: 173
success rate:  0.32
epoch: 37069, loss: 1.2669533491134644, rewards: -85, count: 76
success rate:  0.31
epoch: 37079, loss: 1.3134554624557495, rewards: -264, count: 255
success rate:  0.31
epoch: 37089, loss: 0.6187441945075989, rewards: -919, count: 757
success rate:  0.27
epoch: 37099, loss: 0.8102185726165771, rewards: -416, count: 371
success rate:  0.25
epoch: 37109, loss: 1.2029788494110107, rewards: -392, count: 356
success rate:  0.26
epoch: 37119, loss: 0.9140632152557373, rewards: -192, count: 174
success rate:  0.29
epoch: 37129, loss: 1.1026307344436646, rewards: -342, count: 342
success rate:  0.31
epoch: 37139, loss: 0.8032897710800171, r

success rate:  0.35
epoch: 37999, loss: 2.40463924407959, rewards: -129, count: 120
success rate:  0.37
epoch: 38009, loss: 0.8527058362960815, rewards: -781, count: 646
success rate:  0.39
epoch: 38019, loss: 0.5344775319099426, rewards: -414, count: 387
success rate:  0.41
epoch: 38029, loss: 1.437599539756775, rewards: -99, count: 99
success rate:  0.42
epoch: 38039, loss: 0.9484978914260864, rewards: -60, count: 60
success rate:  0.34
epoch: 38049, loss: 0.8594363927841187, rewards: -519, count: 492
success rate:  0.31
epoch: 38059, loss: 0.9020060300827026, rewards: -644, count: 563
success rate:  0.33
epoch: 38069, loss: 2.2710180282592773, rewards: -26, count: 26
success rate:  0.31
epoch: 38079, loss: 1.282141089439392, rewards: -122, count: 113
success rate:  0.34
epoch: 38089, loss: 0.944898784160614, rewards: -390, count: 309
success rate:  0.34
epoch: 38099, loss: 1.5414056777954102, rewards: -137, count: 128
success rate:  0.33
epoch: 38109, loss: 1.1448532342910767, rewar

success rate:  0.39
epoch: 38969, loss: 1.2986228466033936, rewards: -112, count: 112
success rate:  0.39
epoch: 38979, loss: 1.8579785823822021, rewards: -78, count: 78
success rate:  0.38
epoch: 38989, loss: 1.7370843887329102, rewards: -176, count: 140
success rate:  0.36
epoch: 38999, loss: 1.0567134618759155, rewards: -355, count: 301
success rate:  0.36
epoch: 39009, loss: 1.0078898668289185, rewards: -376, count: 322
success rate:  0.37
epoch: 39019, loss: 1.4679973125457764, rewards: -113, count: 104
success rate:  0.35
epoch: 39029, loss: 1.5041531324386597, rewards: -30, count: 30
success rate:  0.36
epoch: 39039, loss: 1.6184370517730713, rewards: -94, count: 94
success rate:  0.33
epoch: 39049, loss: 0.36336901783943176, rewards: -143, count: 125
success rate:  0.35
epoch: 39059, loss: 1.9025542736053467, rewards: -209, count: 209
success rate:  0.37
epoch: 39069, loss: 1.284014105796814, rewards: -372, count: 336
success rate:  0.39
epoch: 39079, loss: 1.129320740699768, r

success rate:  0.41
epoch: 39939, loss: 1.0967493057250977, rewards: -507, count: 453
success rate:  0.43
epoch: 39949, loss: 1.621298909187317, rewards: -107, count: 107
success rate:  0.41
epoch: 39959, loss: 0.9636894464492798, rewards: -304, count: 295
success rate:  0.42
epoch: 39969, loss: 1.6160365343093872, rewards: -76, count: 76
success rate:  0.41
epoch: 39979, loss: 1.3626039028167725, rewards: -171, count: 162
success rate:  0.38
epoch: 39989, loss: 1.0130798816680908, rewards: -31, count: 31
success rate:  0.38
epoch: 39999, loss: 1.4715991020202637, rewards: -103, count: 103
success rate:  0.38
epoch: 40009, loss: 1.1693854331970215, rewards: -52, count: 52
success rate:  0.35
epoch: 40019, loss: 1.0562987327575684, rewards: -105, count: 96
success rate:  0.33
epoch: 40029, loss: 1.5194883346557617, rewards: -210, count: 201
success rate:  0.31
epoch: 40039, loss: 1.0498639345169067, rewards: -577, count: 550
success rate:  0.3
epoch: 40049, loss: 0.8403651118278503, rew

success rate:  0.33
epoch: 40909, loss: 1.3210326433181763, rewards: -79, count: 70
success rate:  0.34
epoch: 40919, loss: 0.7949885725975037, rewards: -111, count: 102
success rate:  0.33
epoch: 40929, loss: 1.053525447845459, rewards: -106, count: 97
success rate:  0.35
epoch: 40939, loss: 1.8306818008422852, rewards: -115, count: 115
success rate:  0.34
epoch: 40949, loss: 1.0996824502944946, rewards: -417, count: 345
success rate:  0.35
epoch: 40959, loss: 1.1090822219848633, rewards: -132, count: 132
success rate:  0.35
epoch: 40969, loss: 1.3025293350219727, rewards: -144, count: 126
success rate:  0.33
epoch: 40979, loss: 1.3204013109207153, rewards: -106, count: 79
success rate:  0.31
epoch: 40989, loss: 0.40529313683509827, rewards: -156, count: 147
success rate:  0.3
epoch: 40999, loss: 1.864219307899475, rewards: -51, count: 51
success rate:  0.32
epoch: 41009, loss: 1.9290056228637695, rewards: -204, count: 186
success rate:  0.32
epoch: 41019, loss: 1.4895166158676147, re

success rate:  0.34
epoch: 41879, loss: 1.201853632926941, rewards: -355, count: 337
success rate:  0.33
epoch: 41889, loss: 1.413007378578186, rewards: -151, count: 133
success rate:  0.36
epoch: 41899, loss: 1.2866225242614746, rewards: -68, count: 68
success rate:  0.37
epoch: 41909, loss: 1.475044846534729, rewards: -54, count: 54
success rate:  0.35
epoch: 41919, loss: 0.7128162384033203, rewards: -36, count: 36
success rate:  0.34
epoch: 41929, loss: 0.5509474277496338, rewards: -317, count: 290
success rate:  0.36
epoch: 41939, loss: 1.2355148792266846, rewards: -165, count: 156
success rate:  0.36
epoch: 41949, loss: 0.8175143599510193, rewards: -179, count: 179
success rate:  0.36
epoch: 41959, loss: 1.123572826385498, rewards: -154, count: 154
success rate:  0.37
epoch: 41969, loss: 0.5671707391738892, rewards: -77, count: 77
success rate:  0.37
epoch: 41979, loss: 1.1804440021514893, rewards: -135, count: 135
success rate:  0.38
epoch: 41989, loss: 0.8417167663574219, reward

success rate:  0.3
epoch: 42859, loss: 1.0104342699050903, rewards: -309, count: 291
success rate:  0.3
epoch: 42869, loss: 0.7878120541572571, rewards: -421, count: 394
success rate:  0.35
epoch: 42879, loss: 1.3963639736175537, rewards: -254, count: 227
success rate:  0.38
epoch: 42889, loss: 1.0099735260009766, rewards: -619, count: 565
success rate:  0.39
epoch: 42899, loss: 1.1263489723205566, rewards: -258, count: 240
success rate:  0.36
epoch: 42909, loss: 0.9492623805999756, rewards: -159, count: 150
success rate:  0.34
epoch: 42919, loss: 0.521935224533081, rewards: -436, count: 373
success rate:  0.31
epoch: 42929, loss: 1.3446929454803467, rewards: -154, count: 154
success rate:  0.33
epoch: 42939, loss: 1.4568182229995728, rewards: -180, count: 162
success rate:  0.31
epoch: 42949, loss: 0.959311842918396, rewards: -363, count: 318
success rate:  0.37
epoch: 42959, loss: 0.5343813896179199, rewards: -64, count: 55
success rate:  0.35
epoch: 42969, loss: 1.3502745628356934, 

success rate:  0.33
epoch: 43839, loss: 0.9101473093032837, rewards: -81, count: 81
success rate:  0.34
epoch: 43849, loss: 0.4577184021472931, rewards: -26, count: 26
success rate:  0.36
epoch: 43859, loss: 0.3317152261734009, rewards: -81, count: 81
success rate:  0.39
epoch: 43869, loss: 0.8805786967277527, rewards: -311, count: 293
success rate:  0.44
epoch: 43879, loss: 0.6456827521324158, rewards: -176, count: 149
success rate:  0.44
epoch: 43889, loss: 1.2496789693832397, rewards: -221, count: 194
success rate:  0.44
epoch: 43899, loss: 0.8021458983421326, rewards: -832, count: 742
success rate:  0.42
epoch: 43909, loss: 1.3926618099212646, rewards: -69, count: 60
success rate:  0.42
epoch: 43919, loss: 1.0908012390136719, rewards: -77, count: 77
success rate:  0.43
epoch: 43929, loss: 0.7120423316955566, rewards: -48, count: 48
success rate:  0.42
epoch: 43939, loss: 0.837701141834259, rewards: -23, count: 23
success rate:  0.41
epoch: 43949, loss: 1.2419682741165161, rewards: 

success rate:  0.34
epoch: 44809, loss: 0.8340166807174683, rewards: -468, count: 414
success rate:  0.35
epoch: 44819, loss: 1.1402974128723145, rewards: -168, count: 159
success rate:  0.34
epoch: 44829, loss: 0.5347610116004944, rewards: -412, count: 367
success rate:  0.39
epoch: 44839, loss: 0.8419620394706726, rewards: -234, count: 189
success rate:  0.4
epoch: 44849, loss: 1.25430428981781, rewards: -117, count: 108
success rate:  0.41
epoch: 44859, loss: 0.7718085050582886, rewards: -340, count: 295
success rate:  0.4
epoch: 44869, loss: 1.4281227588653564, rewards: -82, count: 82
success rate:  0.4
epoch: 44879, loss: 0.9869410991668701, rewards: -78, count: 78
success rate:  0.41
epoch: 44889, loss: 0.7133529782295227, rewards: -539, count: 467
success rate:  0.4
epoch: 44899, loss: 0.7760394215583801, rewards: -135, count: 126
success rate:  0.38
epoch: 44909, loss: 0.5473949909210205, rewards: -263, count: 245
success rate:  0.39
epoch: 44919, loss: 1.3942300081253052, rewa

success rate:  0.35
epoch: 45779, loss: 0.9478908181190491, rewards: -771, count: 690
success rate:  0.38
epoch: 45789, loss: 0.7551008462905884, rewards: -204, count: 186
success rate:  0.4
epoch: 45799, loss: 1.1412206888198853, rewards: -384, count: 357
success rate:  0.42
epoch: 45809, loss: 1.3886960744857788, rewards: -202, count: 202
success rate:  0.43
epoch: 45819, loss: 1.3515148162841797, rewards: -133, count: 133
success rate:  0.43
epoch: 45829, loss: 0.9099479913711548, rewards: -120, count: 111
success rate:  0.42
epoch: 45839, loss: 0.7337086796760559, rewards: -687, count: 624
success rate:  0.44
epoch: 45849, loss: 0.9282209277153015, rewards: -85, count: 85
success rate:  0.45
epoch: 45859, loss: 1.0566073656082153, rewards: -108, count: 108
success rate:  0.44
epoch: 45869, loss: 1.706937551498413, rewards: -168, count: 159
success rate:  0.47
epoch: 45879, loss: 1.710080862045288, rewards: -90, count: 90
success rate:  0.47
epoch: 45889, loss: 1.1549793481826782, r

success rate:  0.38
epoch: 46759, loss: 1.19911789894104, rewards: -32, count: 32
success rate:  0.39
epoch: 46769, loss: 0.6733399033546448, rewards: -417, count: 381
success rate:  0.37
epoch: 46779, loss: 1.6508365869522095, rewards: -106, count: 97
success rate:  0.37
epoch: 46789, loss: 0.7029780149459839, rewards: -140, count: 140
success rate:  0.37
epoch: 46799, loss: 1.635443091392517, rewards: -61, count: 61
success rate:  0.4
epoch: 46809, loss: 0.6700221300125122, rewards: -305, count: 269
success rate:  0.44
epoch: 46819, loss: 1.3590763807296753, rewards: -297, count: 297
success rate:  0.43
epoch: 46829, loss: 0.3368340730667114, rewards: -140, count: 140
success rate:  0.47
epoch: 46839, loss: 0.8521682024002075, rewards: -228, count: 210
success rate:  0.5
epoch: 46849, loss: 1.5133956670761108, rewards: -132, count: 114
success rate:  0.5
epoch: 46859, loss: 1.2726494073867798, rewards: -195, count: 159
success rate:  0.49
epoch: 46869, loss: 1.5098844766616821, rewar

success rate:  0.3
epoch: 47739, loss: 1.0848265886306763, rewards: -118, count: 109
success rate:  0.29
epoch: 47749, loss: 1.5497550964355469, rewards: -61, count: 52
success rate:  0.33
epoch: 47759, loss: 2.0103135108947754, rewards: -162, count: 144
success rate:  0.34
epoch: 47769, loss: 1.301572322845459, rewards: -137, count: 128
success rate:  0.37
epoch: 47779, loss: 1.6937243938446045, rewards: -121, count: 121
success rate:  0.39
epoch: 47789, loss: 1.2713226079940796, rewards: -140, count: 131
success rate:  0.42
epoch: 47799, loss: 1.6931012868881226, rewards: -47, count: 47
success rate:  0.39
epoch: 47809, loss: 1.702384352684021, rewards: -72, count: 72
success rate:  0.39
epoch: 47819, loss: 1.304697871208191, rewards: -145, count: 127
success rate:  0.38
epoch: 47829, loss: 1.4801479578018188, rewards: -231, count: 213
success rate:  0.39
epoch: 47839, loss: 0.8791124224662781, rewards: -140, count: 131
success rate:  0.42
epoch: 47849, loss: 1.073789119720459, rewar

success rate:  0.32
epoch: 48709, loss: 1.86930251121521, rewards: -77, count: 59
success rate:  0.33
epoch: 48719, loss: 0.9405225515365601, rewards: -311, count: 284
success rate:  0.33
epoch: 48729, loss: 1.1444334983825684, rewards: -35, count: 35
success rate:  0.34
epoch: 48739, loss: 0.7011727094650269, rewards: -107, count: 89
success rate:  0.33
epoch: 48749, loss: 1.1313399076461792, rewards: -152, count: 152
success rate:  0.33
epoch: 48759, loss: 1.2659022808074951, rewards: -159, count: 132
success rate:  0.32
epoch: 48769, loss: 1.955854058265686, rewards: -134, count: 134
success rate:  0.34
epoch: 48779, loss: 1.1807208061218262, rewards: -160, count: 160
success rate:  0.35
epoch: 48789, loss: 0.9535049200057983, rewards: -66, count: 66
success rate:  0.36
epoch: 48799, loss: 1.1110893487930298, rewards: -394, count: 385
success rate:  0.35
epoch: 48809, loss: 1.68747079372406, rewards: -134, count: 125
success rate:  0.37
epoch: 48819, loss: 1.4569016695022583, reward

success rate:  0.38
epoch: 49689, loss: 1.2610623836517334, rewards: -112, count: 112
success rate:  0.36
epoch: 49699, loss: 0.6340560913085938, rewards: -268, count: 259
success rate:  0.38
epoch: 49709, loss: 1.127767562866211, rewards: -40, count: 40
success rate:  0.39
epoch: 49719, loss: 0.2134244740009308, rewards: -63, count: 63
success rate:  0.37
epoch: 49729, loss: 0.7464843392372131, rewards: -404, count: 359
success rate:  0.39
epoch: 49739, loss: 0.8634244203567505, rewards: -77, count: 77
success rate:  0.38
epoch: 49749, loss: 0.7090489864349365, rewards: -234, count: 216
success rate:  0.35
epoch: 49759, loss: 1.0268807411193848, rewards: -260, count: 233
success rate:  0.38
epoch: 49769, loss: 1.447198510169983, rewards: -95, count: 95
success rate:  0.37
epoch: 49779, loss: 0.3650432825088501, rewards: -135, count: 117
success rate:  0.32
epoch: 49789, loss: 1.4213835000991821, rewards: -145, count: 136
success rate:  0.29
epoch: 49799, loss: 0.7687351703643799, rewa

success rate:  0.31
epoch: 50659, loss: 1.3356623649597168, rewards: -77, count: 77
success rate:  0.32
epoch: 50669, loss: 0.6811732649803162, rewards: -857, count: 776
success rate:  0.33
epoch: 50679, loss: 0.6120616793632507, rewards: -180, count: 144
success rate:  0.32
epoch: 50689, loss: 1.4575613737106323, rewards: -93, count: 93
success rate:  0.33
epoch: 50699, loss: 1.2696571350097656, rewards: -195, count: 168
success rate:  0.35
epoch: 50709, loss: 0.731826663017273, rewards: -407, count: 371
success rate:  0.34
epoch: 50719, loss: 1.0678496360778809, rewards: -171, count: 162
success rate:  0.33
epoch: 50729, loss: 1.3809587955474854, rewards: -195, count: 195
success rate:  0.37
epoch: 50739, loss: 1.2171032428741455, rewards: -86, count: 77
success rate:  0.37
epoch: 50749, loss: 1.1358433961868286, rewards: -66, count: 66
success rate:  0.35
epoch: 50759, loss: 1.556814432144165, rewards: -119, count: 110
success rate:  0.35
epoch: 50769, loss: 1.0296746492385864, rewa

success rate:  0.33
epoch: 51629, loss: 1.1853601932525635, rewards: -87, count: 78
success rate:  0.32
epoch: 51639, loss: 0.5938228368759155, rewards: -806, count: 734
success rate:  0.26
epoch: 51649, loss: 1.301478385925293, rewards: -202, count: 184
success rate:  0.28
epoch: 51659, loss: 0.7967184782028198, rewards: -234, count: 216
success rate:  0.28
epoch: 51669, loss: 1.5874180793762207, rewards: -64, count: 64
success rate:  0.27
epoch: 51679, loss: 0.6283184289932251, rewards: -57, count: 57
success rate:  0.28
epoch: 51689, loss: 0.8880526423454285, rewards: -202, count: 175
success rate:  0.27
epoch: 51699, loss: 0.837984561920166, rewards: -200, count: 191
success rate:  0.27
epoch: 51709, loss: 1.1413788795471191, rewards: -172, count: 163
success rate:  0.25
epoch: 51719, loss: 1.336958885192871, rewards: -80, count: 80
success rate:  0.27
epoch: 51729, loss: 0.945745587348938, rewards: -259, count: 250
success rate:  0.3
epoch: 51739, loss: 1.5528717041015625, rewards

success rate:  0.35
epoch: 52599, loss: 0.7004067301750183, rewards: -210, count: 192
success rate:  0.38
epoch: 52609, loss: 0.5077326893806458, rewards: -39, count: 39
success rate:  0.39
epoch: 52619, loss: 0.8954171538352966, rewards: -123, count: 105
success rate:  0.38
epoch: 52629, loss: 1.0817457437515259, rewards: -243, count: 234
success rate:  0.37
epoch: 52639, loss: 1.1805094480514526, rewards: -377, count: 368
success rate:  0.4
epoch: 52649, loss: 0.47870102524757385, rewards: -40, count: 40
success rate:  0.37
epoch: 52659, loss: 0.7709051370620728, rewards: -224, count: 206
success rate:  0.37
epoch: 52669, loss: 0.8722513914108276, rewards: -58, count: 58
success rate:  0.37
epoch: 52679, loss: 0.7504938840866089, rewards: -52, count: 52
success rate:  0.35
epoch: 52689, loss: 0.5274327397346497, rewards: -595, count: 550
success rate:  0.35
epoch: 52699, loss: 1.100542664527893, rewards: -116, count: 107
success rate:  0.36
epoch: 52709, loss: 1.4792838096618652, rew

success rate:  0.36
epoch: 53579, loss: 1.0758087635040283, rewards: -172, count: 163
success rate:  0.36
epoch: 53589, loss: 1.1974676847457886, rewards: -124, count: 124
success rate:  0.35
epoch: 53599, loss: 0.7098067998886108, rewards: -142, count: 124
success rate:  0.39
epoch: 53609, loss: 1.376468300819397, rewards: -81, count: 81
success rate:  0.39
epoch: 53619, loss: 1.4477530717849731, rewards: -54, count: 54
success rate:  0.37
epoch: 53629, loss: 0.8937253952026367, rewards: -105, count: 105
success rate:  0.41
epoch: 53639, loss: 0.9675199389457703, rewards: -46, count: 46
success rate:  0.38
epoch: 53649, loss: 0.6488017439842224, rewards: -822, count: 732
success rate:  0.37
epoch: 53659, loss: 0.7576723694801331, rewards: -250, count: 241
success rate:  0.34
epoch: 53669, loss: 0.740024209022522, rewards: -137, count: 137
success rate:  0.34
epoch: 53679, loss: 1.3043781518936157, rewards: -118, count: 118
success rate:  0.31
epoch: 53689, loss: 0.850233793258667, rew

success rate:  0.42
epoch: 54559, loss: 1.5402436256408691, rewards: -83, count: 83
success rate:  0.4
epoch: 54569, loss: 1.6813040971755981, rewards: -203, count: 185
success rate:  0.39
epoch: 54579, loss: 1.6121057271957397, rewards: -82, count: 82
success rate:  0.37
epoch: 54589, loss: 1.2297087907791138, rewards: -308, count: 263
success rate:  0.35
epoch: 54599, loss: 0.6872783899307251, rewards: -268, count: 241
success rate:  0.33
epoch: 54609, loss: 0.6046911478042603, rewards: -238, count: 229
success rate:  0.3
epoch: 54619, loss: 0.8631728291511536, rewards: -237, count: 219
success rate:  0.26
epoch: 54629, loss: 1.1655352115631104, rewards: -76, count: 67
success rate:  0.25
epoch: 54639, loss: 1.5490219593048096, rewards: -116, count: 116
success rate:  0.25
epoch: 54649, loss: 1.5544906854629517, rewards: -151, count: 133
success rate:  0.23
epoch: 54659, loss: 0.7912425994873047, rewards: -248, count: 221
success rate:  0.27
epoch: 54669, loss: 0.9103670716285706, re

success rate:  0.37
epoch: 55529, loss: 1.6639539003372192, rewards: -104, count: 104
success rate:  0.35
epoch: 55539, loss: 1.1924185752868652, rewards: -186, count: 168
success rate:  0.34
epoch: 55549, loss: 1.4841912984848022, rewards: -155, count: 146
success rate:  0.33
epoch: 55559, loss: 1.5030196905136108, rewards: -80, count: 80
success rate:  0.29
epoch: 55569, loss: 0.7981448769569397, rewards: -193, count: 157
success rate:  0.28
epoch: 55579, loss: 1.0228499174118042, rewards: -379, count: 325
success rate:  0.27
epoch: 55589, loss: 0.4649258852005005, rewards: -135, count: 117
success rate:  0.3
epoch: 55599, loss: 0.9470508694648743, rewards: -184, count: 175
success rate:  0.28
epoch: 55609, loss: 1.4479687213897705, rewards: -214, count: 214
success rate:  0.27
epoch: 55619, loss: 1.2921876907348633, rewards: -117, count: 108
success rate:  0.27
epoch: 55629, loss: 1.625387191772461, rewards: -46, count: 37
success rate:  0.3
epoch: 55639, loss: 0.41912564635276794, 

success rate:  0.29
epoch: 56509, loss: 0.7025762796401978, rewards: -672, count: 609
success rate:  0.31
epoch: 56519, loss: 1.1935683488845825, rewards: -150, count: 132
success rate:  0.33
epoch: 56529, loss: 0.7902360558509827, rewards: -644, count: 590
success rate:  0.33
epoch: 56539, loss: 0.8383123874664307, rewards: -261, count: 234
success rate:  0.35
epoch: 56549, loss: 0.46015533804893494, rewards: -269, count: 242
success rate:  0.34
epoch: 56559, loss: 1.3637077808380127, rewards: -99, count: 99
success rate:  0.34
epoch: 56569, loss: 1.663449764251709, rewards: -135, count: 126
success rate:  0.34
epoch: 56579, loss: 1.4246716499328613, rewards: -111, count: 111
success rate:  0.36
epoch: 56589, loss: 1.353926181793213, rewards: -330, count: 321
success rate:  0.36
epoch: 56599, loss: 1.2451682090759277, rewards: -123, count: 123
success rate:  0.4
epoch: 56609, loss: 1.4315953254699707, rewards: -369, count: 360
success rate:  0.38
epoch: 56619, loss: 1.015368938446045,

success rate:  0.39
epoch: 57479, loss: 0.35555407404899597, rewards: -39, count: 39
success rate:  0.39
epoch: 57489, loss: 0.844184398651123, rewards: -179, count: 143
success rate:  0.4
epoch: 57499, loss: 1.317417860031128, rewards: -137, count: 110
success rate:  0.39
epoch: 57509, loss: 1.5211800336837769, rewards: -83, count: 47
success rate:  0.41
epoch: 57519, loss: 0.9367952346801758, rewards: -46, count: 37
success rate:  0.42
epoch: 57529, loss: 0.7033505439758301, rewards: -708, count: 654
success rate:  0.41
epoch: 57539, loss: 0.3205301761627197, rewards: -269, count: 251
success rate:  0.41
epoch: 57549, loss: 1.10520339012146, rewards: -64, count: 64
success rate:  0.42
epoch: 57559, loss: 0.9871290326118469, rewards: -345, count: 336
success rate:  0.4
epoch: 57569, loss: 0.9500797986984253, rewards: -196, count: 196
success rate:  0.41
epoch: 57579, loss: 0.9925543665885925, rewards: -202, count: 184
success rate:  0.41
epoch: 57589, loss: 0.890547513961792, rewards:

success rate:  0.3
epoch: 58449, loss: 0.8198046088218689, rewards: -476, count: 422
success rate:  0.29
epoch: 58459, loss: 0.6975078582763672, rewards: -508, count: 481
success rate:  0.27
epoch: 58469, loss: 0.6658869385719299, rewards: -358, count: 322
success rate:  0.29
epoch: 58479, loss: 0.693324863910675, rewards: -324, count: 297
success rate:  0.32
epoch: 58489, loss: 0.6066444516181946, rewards: -85, count: 85
success rate:  0.3
epoch: 58499, loss: 1.2339813709259033, rewards: -193, count: 166
success rate:  0.31
epoch: 58509, loss: 1.8168715238571167, rewards: -56, count: 56
success rate:  0.32
epoch: 58519, loss: 1.0626747608184814, rewards: -167, count: 158
success rate:  0.35
epoch: 58529, loss: 0.5215069651603699, rewards: -321, count: 294
success rate:  0.39
epoch: 58539, loss: 1.0044063329696655, rewards: -200, count: 191
success rate:  0.38
epoch: 58549, loss: 0.33906716108322144, rewards: -101, count: 92
success rate:  0.38
epoch: 58559, loss: 0.8556857109069824, r

success rate:  0.28
epoch: 59409, loss: 0.9994964599609375, rewards: -194, count: 185
success rate:  0.28
epoch: 59419, loss: 0.39606085419654846, rewards: -415, count: 388
success rate:  0.3
epoch: 59429, loss: 0.9608153104782104, rewards: -118, count: 118
success rate:  0.31
epoch: 59439, loss: 1.8925482034683228, rewards: -97, count: 79
success rate:  0.36
epoch: 59449, loss: 0.899405300617218, rewards: -75, count: 75
success rate:  0.37
epoch: 59459, loss: 0.5846131443977356, rewards: -409, count: 355
success rate:  0.37
epoch: 59469, loss: 1.121305227279663, rewards: -78, count: 78
success rate:  0.39
epoch: 59479, loss: 0.6087697148323059, rewards: -411, count: 357
success rate:  0.39
epoch: 59489, loss: 1.7530521154403687, rewards: -165, count: 147
success rate:  0.4
epoch: 59499, loss: 1.7396045923233032, rewards: -95, count: 95
success rate:  0.42
epoch: 59509, loss: 1.1914814710617065, rewards: -91, count: 91
success rate:  0.42
epoch: 59519, loss: 1.3687812089920044, rewards

success rate:  0.37
epoch: 60379, loss: 1.1629343032836914, rewards: -422, count: 395
success rate:  0.37
epoch: 60389, loss: 1.167075753211975, rewards: -686, count: 596
success rate:  0.39
epoch: 60399, loss: 0.9099293351173401, rewards: -86, count: 86
success rate:  0.34
epoch: 60409, loss: 1.2338722944259644, rewards: -308, count: 263
success rate:  0.37
epoch: 60419, loss: 1.2593903541564941, rewards: -645, count: 591
success rate:  0.39
epoch: 60429, loss: 1.552449345588684, rewards: -130, count: 130
success rate:  0.37
epoch: 60439, loss: 1.5579544305801392, rewards: -105, count: 96
success rate:  0.38
epoch: 60449, loss: 1.6394942998886108, rewards: -130, count: 130
success rate:  0.39
epoch: 60459, loss: 1.2097821235656738, rewards: -169, count: 169
success rate:  0.42
epoch: 60469, loss: 0.9693484902381897, rewards: -343, count: 298
success rate:  0.43
epoch: 60479, loss: 1.9152932167053223, rewards: -146, count: 146
success rate:  0.43
epoch: 60489, loss: 1.5211527347564697,

success rate:  0.4
epoch: 61349, loss: 1.1374452114105225, rewards: -93, count: 93
success rate:  0.35
epoch: 61359, loss: 1.16612708568573, rewards: -95, count: 86
success rate:  0.33
epoch: 61369, loss: 1.1446834802627563, rewards: -251, count: 233
success rate:  0.32
epoch: 61379, loss: 0.7341775894165039, rewards: -413, count: 359
success rate:  0.32
epoch: 61389, loss: 0.8960113525390625, rewards: -85, count: 85
success rate:  0.34
epoch: 61399, loss: 0.257267564535141, rewards: -69, count: 69
success rate:  0.35
epoch: 61409, loss: 0.6493131518363953, rewards: -630, count: 576
success rate:  0.36
epoch: 61419, loss: 1.105204701423645, rewards: -102, count: 102
success rate:  0.33
epoch: 61429, loss: 0.9007470607757568, rewards: -202, count: 193
success rate:  0.3
epoch: 61439, loss: 1.2559630870819092, rewards: -234, count: 216
success rate:  0.3
epoch: 61449, loss: 0.872431755065918, rewards: -240, count: 231
success rate:  0.32
epoch: 61459, loss: 1.445588231086731, rewards: -9

success rate:  0.31
epoch: 62319, loss: 0.9605278968811035, rewards: -438, count: 411
success rate:  0.3
epoch: 62329, loss: 2.389557361602783, rewards: -164, count: 146
success rate:  0.32
epoch: 62339, loss: 1.0101807117462158, rewards: -188, count: 188
success rate:  0.35
epoch: 62349, loss: 1.3732811212539673, rewards: -371, count: 344
success rate:  0.38
epoch: 62359, loss: 1.4282522201538086, rewards: -108, count: 108
success rate:  0.35
epoch: 62369, loss: 0.9329046607017517, rewards: -302, count: 266
success rate:  0.33
epoch: 62379, loss: 1.3407642841339111, rewards: -300, count: 273
success rate:  0.32
epoch: 62389, loss: 1.7212698459625244, rewards: -86, count: 86
success rate:  0.34
epoch: 62399, loss: 0.8338751792907715, rewards: -92, count: 92
success rate:  0.34
epoch: 62409, loss: 1.408470869064331, rewards: -288, count: 270
success rate:  0.39
epoch: 62419, loss: 1.5394161939620972, rewards: -83, count: 83
success rate:  0.4
epoch: 62429, loss: 1.3327358961105347, rewa

success rate:  0.36
epoch: 63289, loss: 0.6526708006858826, rewards: -591, count: 546
success rate:  0.34
epoch: 63299, loss: 1.172656536102295, rewards: -116, count: 80
success rate:  0.34
epoch: 63309, loss: 1.0796629190444946, rewards: -172, count: 145
success rate:  0.35
epoch: 63319, loss: 0.5001351237297058, rewards: -619, count: 547
success rate:  0.32
epoch: 63329, loss: 1.1745364665985107, rewards: -108, count: 99
success rate:  0.35
epoch: 63339, loss: 0.17490002512931824, rewards: -243, count: 234
success rate:  0.36
epoch: 63349, loss: 0.42418113350868225, rewards: -53, count: 53
success rate:  0.35
epoch: 63359, loss: 0.8092323541641235, rewards: -228, count: 201
success rate:  0.35
epoch: 63369, loss: 1.0496635437011719, rewards: -115, count: 115
success rate:  0.37
epoch: 63379, loss: 0.9582884311676025, rewards: -138, count: 129
success rate:  0.36
epoch: 63389, loss: 0.35186195373535156, rewards: -45, count: 36
success rate:  0.36
epoch: 63399, loss: 0.7275748252868652

success rate:  0.4
epoch: 64259, loss: 1.4234689474105835, rewards: -162, count: 153
success rate:  0.39
epoch: 64269, loss: 1.587435007095337, rewards: -261, count: 252
success rate:  0.42
epoch: 64279, loss: 0.8337451219558716, rewards: -120, count: 111
success rate:  0.42
epoch: 64289, loss: 1.5163713693618774, rewards: -175, count: 175
success rate:  0.39
epoch: 64299, loss: 1.1011768579483032, rewards: -173, count: 164
success rate:  0.38
epoch: 64309, loss: 0.14364513754844666, rewards: -171, count: 153
success rate:  0.36
epoch: 64319, loss: 1.6270837783813477, rewards: -96, count: 96
success rate:  0.37
epoch: 64329, loss: 1.51043701171875, rewards: -114, count: 105
success rate:  0.39
epoch: 64339, loss: 1.1690452098846436, rewards: -209, count: 209
success rate:  0.37
epoch: 64349, loss: 1.1720097064971924, rewards: -80, count: 80
success rate:  0.37
epoch: 64359, loss: 0.9304184317588806, rewards: -51, count: 51
success rate:  0.38
epoch: 64369, loss: 1.5204154253005981, rew

success rate:  0.39
epoch: 65229, loss: 0.8118214011192322, rewards: -53, count: 53
success rate:  0.36
epoch: 65239, loss: 1.3415565490722656, rewards: -147, count: 138
success rate:  0.38
epoch: 65249, loss: 1.010313630104065, rewards: -175, count: 139
success rate:  0.38
epoch: 65259, loss: 0.9324995279312134, rewards: -296, count: 278
success rate:  0.41
epoch: 65269, loss: 1.5494097471237183, rewards: -48, count: 48
success rate:  0.38
epoch: 65279, loss: 0.6483599543571472, rewards: -523, count: 451
success rate:  0.37
epoch: 65289, loss: 0.9543619751930237, rewards: -151, count: 151
success rate:  0.34
epoch: 65299, loss: 1.432456135749817, rewards: -212, count: 194
success rate:  0.35
epoch: 65309, loss: 1.0448328256607056, rewards: -74, count: 65
success rate:  0.32
epoch: 65319, loss: 1.3687492609024048, rewards: -157, count: 148
success rate:  0.33
epoch: 65329, loss: 1.5484768152236938, rewards: -96, count: 87
success rate:  0.33
epoch: 65339, loss: 0.7543407082557678, rewa

success rate:  0.41
epoch: 66209, loss: 0.3805810809135437, rewards: -99, count: 90
success rate:  0.38
epoch: 66219, loss: 1.2595179080963135, rewards: -47, count: 38
success rate:  0.41
epoch: 66229, loss: 0.6354739665985107, rewards: -474, count: 438
success rate:  0.37
epoch: 66239, loss: 1.892004370689392, rewards: -72, count: 63
success rate:  0.37
epoch: 66249, loss: 1.0572177171707153, rewards: -353, count: 335
success rate:  0.36
epoch: 66259, loss: 1.4496592283248901, rewards: -65, count: 65
success rate:  0.33
epoch: 66269, loss: 1.2821941375732422, rewards: -302, count: 293
success rate:  0.35
epoch: 66279, loss: 1.1210899353027344, rewards: -243, count: 225
success rate:  0.36
epoch: 66289, loss: 0.7193204760551453, rewards: -210, count: 192
success rate:  0.34
epoch: 66299, loss: 0.49390092492103577, rewards: -87, count: 87
success rate:  0.33
epoch: 66309, loss: 1.0830134153366089, rewards: -363, count: 345
success rate:  0.36
epoch: 66319, loss: 1.2555036544799805, rewa

success rate:  0.44
epoch: 67189, loss: 0.7655684947967529, rewards: -63, count: 54
success rate:  0.42
epoch: 67199, loss: 0.664470911026001, rewards: -81, count: 72
success rate:  0.42
epoch: 67209, loss: 0.622631847858429, rewards: -367, count: 322
success rate:  0.44
epoch: 67219, loss: 1.2024492025375366, rewards: -152, count: 152
success rate:  0.41
epoch: 67229, loss: 1.120707392692566, rewards: -81, count: 81
success rate:  0.45
epoch: 67239, loss: 0.5976876020431519, rewards: -75, count: 75
success rate:  0.44
epoch: 67249, loss: 1.1369432210922241, rewards: -61, count: 61
success rate:  0.46
epoch: 67259, loss: 1.0634510517120361, rewards: -152, count: 152
success rate:  0.44
epoch: 67269, loss: 1.0878506898880005, rewards: -169, count: 151
success rate:  0.42
epoch: 67279, loss: 0.5454657077789307, rewards: -190, count: 181
success rate:  0.4
epoch: 67289, loss: 0.6314037442207336, rewards: -328, count: 310
success rate:  0.4
epoch: 67299, loss: 1.2149567604064941, rewards: 

success rate:  0.33
epoch: 68159, loss: 0.9297579526901245, rewards: -394, count: 376
success rate:  0.33
epoch: 68169, loss: 0.9247192144393921, rewards: -317, count: 290
success rate:  0.33
epoch: 68179, loss: 0.9848244786262512, rewards: -487, count: 424
success rate:  0.33
epoch: 68189, loss: 0.8668966293334961, rewards: -79, count: 79
success rate:  0.34
epoch: 68199, loss: 1.882578730583191, rewards: -78, count: 60
success rate:  0.35
epoch: 68209, loss: 1.211491346359253, rewards: -219, count: 201
success rate:  0.34
epoch: 68219, loss: 1.0851211547851562, rewards: -168, count: 141
success rate:  0.38
epoch: 68229, loss: 0.8192147612571716, rewards: -47, count: 47
success rate:  0.42
epoch: 68239, loss: 1.2936725616455078, rewards: -56, count: 56
success rate:  0.42
epoch: 68249, loss: 0.6908413171768188, rewards: -60, count: 60
success rate:  0.39
epoch: 68259, loss: 1.1787959337234497, rewards: -208, count: 181
success rate:  0.39
epoch: 68269, loss: 1.0068529844284058, reward

success rate:  0.28
epoch: 69129, loss: 1.1888349056243896, rewards: -87, count: 78
success rate:  0.28
epoch: 69139, loss: 0.4523008167743683, rewards: -454, count: 427
success rate:  0.27
epoch: 69149, loss: 0.7286129593849182, rewards: -107, count: 98
success rate:  0.28
epoch: 69159, loss: 0.8904353976249695, rewards: -108, count: 108
success rate:  0.3
epoch: 69169, loss: 0.6643310785293579, rewards: -65, count: 65
success rate:  0.29
epoch: 69179, loss: 0.3987506926059723, rewards: -482, count: 455
success rate:  0.29
epoch: 69189, loss: 0.4706515371799469, rewards: -340, count: 313
success rate:  0.29
epoch: 69199, loss: 0.5396567583084106, rewards: -853, count: 799
success rate:  0.29
epoch: 69209, loss: 0.6279484033584595, rewards: -185, count: 167
success rate:  0.29
epoch: 69219, loss: 0.6275453567504883, rewards: -118, count: 109
success rate:  0.28
epoch: 69229, loss: 1.3378980159759521, rewards: -121, count: 103
success rate:  0.27
epoch: 69239, loss: 0.6342113018035889, 

success rate:  0.31
epoch: 70099, loss: 0.9565898180007935, rewards: -362, count: 335
success rate:  0.3
epoch: 70109, loss: 0.5668984651565552, rewards: -299, count: 263
success rate:  0.34
epoch: 70119, loss: 0.6119830012321472, rewards: -48, count: 48
success rate:  0.36
epoch: 70129, loss: 0.9642447829246521, rewards: -103, count: 94
success rate:  0.34
epoch: 70139, loss: 1.0261731147766113, rewards: -301, count: 301
success rate:  0.36
epoch: 70149, loss: 0.820068359375, rewards: -459, count: 423
success rate:  0.36
epoch: 70159, loss: 1.0292679071426392, rewards: -188, count: 179
success rate:  0.35
epoch: 70169, loss: 0.8714306354522705, rewards: -150, count: 141
success rate:  0.35
epoch: 70179, loss: 0.8764936923980713, rewards: -145, count: 127
success rate:  0.37
epoch: 70189, loss: 0.7249723672866821, rewards: -238, count: 220
success rate:  0.38
epoch: 70199, loss: 1.089656114578247, rewards: -184, count: 157
success rate:  0.4
epoch: 70209, loss: 0.9942643046379089, rewa

success rate:  0.34
epoch: 71069, loss: 0.358388215303421, rewards: -283, count: 238
success rate:  0.33
epoch: 71079, loss: 1.3136546611785889, rewards: -151, count: 142
success rate:  0.31
epoch: 71089, loss: 0.3316178023815155, rewards: -104, count: 95
success rate:  0.27
epoch: 71099, loss: 1.8587689399719238, rewards: -50, count: 41
success rate:  0.3
epoch: 71109, loss: 0.7415487170219421, rewards: -253, count: 235
success rate:  0.33
epoch: 71119, loss: 0.9693646430969238, rewards: -591, count: 501
success rate:  0.35
epoch: 71129, loss: 0.7852345705032349, rewards: -549, count: 504
success rate:  0.39
epoch: 71139, loss: 0.9042096138000488, rewards: -94, count: 94
success rate:  0.36
epoch: 71149, loss: 0.5581729412078857, rewards: -982, count: 919
success rate:  0.34
epoch: 71159, loss: 1.335805892944336, rewards: -361, count: 334
success rate:  0.3
epoch: 71169, loss: 1.0172010660171509, rewards: -82, count: 73
success rate:  0.33
epoch: 71179, loss: 1.4657795429229736, rewar

success rate:  0.33
epoch: 72039, loss: 0.6594076752662659, rewards: -24, count: 24
success rate:  0.32
epoch: 72049, loss: 0.637755274772644, rewards: -246, count: 237
success rate:  0.32
epoch: 72059, loss: 1.1671998500823975, rewards: -30, count: 30
success rate:  0.32
epoch: 72069, loss: 0.962296187877655, rewards: -189, count: 180
success rate:  0.33
epoch: 72079, loss: 0.4609187841415405, rewards: -216, count: 198
success rate:  0.35
epoch: 72089, loss: 1.056606650352478, rewards: -75, count: 75
success rate:  0.33
epoch: 72099, loss: 0.9924894571304321, rewards: -238, count: 220
success rate:  0.3
epoch: 72109, loss: 1.1979674100875854, rewards: -213, count: 204
success rate:  0.29
epoch: 72119, loss: 1.1007696390151978, rewards: -51, count: 42
success rate:  0.3
epoch: 72129, loss: 0.9938599467277527, rewards: -59, count: 59
success rate:  0.31
epoch: 72139, loss: 0.956872820854187, rewards: -117, count: 117
success rate:  0.35
epoch: 72149, loss: 0.9271606802940369, rewards: -

success rate:  0.3
epoch: 73009, loss: 0.8951222896575928, rewards: -489, count: 453
success rate:  0.34
epoch: 73019, loss: 0.7219129800796509, rewards: -120, count: 111
success rate:  0.31
epoch: 73029, loss: 0.8605585098266602, rewards: -471, count: 426
success rate:  0.34
epoch: 73039, loss: 0.979947566986084, rewards: -103, count: 103
success rate:  0.34
epoch: 73049, loss: 1.0834342241287231, rewards: -165, count: 156
success rate:  0.31
epoch: 73059, loss: 0.6611975431442261, rewards: -117, count: 99
success rate:  0.32
epoch: 73069, loss: 1.4250924587249756, rewards: -142, count: 142
success rate:  0.3
epoch: 73079, loss: 1.2940247058868408, rewards: -162, count: 162
success rate:  0.3
epoch: 73089, loss: 1.2599866390228271, rewards: -133, count: 115
success rate:  0.32
epoch: 73099, loss: 1.9229700565338135, rewards: -114, count: 105
success rate:  0.31
epoch: 73109, loss: 1.0958672761917114, rewards: -304, count: 268
success rate:  0.29
epoch: 73119, loss: 1.2725125551223755,

success rate:  0.37
epoch: 73989, loss: 0.46402183175086975, rewards: -38, count: 38
success rate:  0.39
epoch: 73999, loss: 0.9725519418716431, rewards: -239, count: 194
success rate:  0.38
epoch: 74009, loss: 0.442729651927948, rewards: -50, count: 50
success rate:  0.4
epoch: 74019, loss: 0.4535735845565796, rewards: -49, count: 49
success rate:  0.42
epoch: 74029, loss: 0.6433787941932678, rewards: -124, count: 124
success rate:  0.4
epoch: 74039, loss: 0.899854302406311, rewards: -218, count: 200
success rate:  0.37
epoch: 74049, loss: 0.9236836433410645, rewards: -159, count: 141
success rate:  0.37
epoch: 74059, loss: 0.6719626784324646, rewards: -64, count: 64
success rate:  0.36
epoch: 74069, loss: 0.7609357833862305, rewards: -72, count: 63
success rate:  0.36
epoch: 74079, loss: 0.46734538674354553, rewards: -810, count: 711
success rate:  0.32
epoch: 74089, loss: 0.5075808167457581, rewards: -72, count: 63
success rate:  0.31
epoch: 74099, loss: 0.47875550389289856, rewards

success rate:  0.3
epoch: 74959, loss: 0.8971756100654602, rewards: -101, count: 101
success rate:  0.32
epoch: 74969, loss: 0.9192360639572144, rewards: -144, count: 144
success rate:  0.31
epoch: 74979, loss: 1.0604453086853027, rewards: -74, count: 74
success rate:  0.31
epoch: 74989, loss: 1.4680380821228027, rewards: -128, count: 128
success rate:  0.35
epoch: 74999, loss: 0.7185801863670349, rewards: -94, count: 94
success rate:  0.36
epoch: 75009, loss: 0.9501987099647522, rewards: -135, count: 135
success rate:  0.36
epoch: 75019, loss: 0.7501729130744934, rewards: -246, count: 237
success rate:  0.33
epoch: 75029, loss: 0.761415958404541, rewards: -46, count: 46
success rate:  0.35
epoch: 75039, loss: 0.7097522020339966, rewards: -68, count: 68
success rate:  0.33
epoch: 75049, loss: 0.7854210734367371, rewards: -275, count: 257
success rate:  0.34
epoch: 75059, loss: 0.7063577175140381, rewards: -221, count: 212
success rate:  0.34
epoch: 75069, loss: 0.5573062896728516, rewa

success rate:  0.26
epoch: 75929, loss: 0.3762151896953583, rewards: -150, count: 141
success rate:  0.25
epoch: 75939, loss: 0.37202322483062744, rewards: -191, count: 173
success rate:  0.26
epoch: 75949, loss: 0.9821435809135437, rewards: -152, count: 152
success rate:  0.28
epoch: 75959, loss: 0.6732373237609863, rewards: -107, count: 107
success rate:  0.28
epoch: 75969, loss: 1.0391380786895752, rewards: -120, count: 102
success rate:  0.31
epoch: 75979, loss: 0.8539122939109802, rewards: -315, count: 297
success rate:  0.28
epoch: 75989, loss: 1.043973684310913, rewards: -155, count: 137
success rate:  0.28
epoch: 75999, loss: 0.5262287855148315, rewards: -997, count: 889
success rate:  0.28
epoch: 76009, loss: 0.800308108329773, rewards: -174, count: 165
success rate:  0.32
epoch: 76019, loss: 0.5294593572616577, rewards: -432, count: 378
success rate:  0.29
epoch: 76029, loss: 0.41485199332237244, rewards: -274, count: 247
success rate:  0.29
epoch: 76039, loss: 0.507206082344

success rate:  0.27
epoch: 76899, loss: 1.3433866500854492, rewards: -270, count: 261
success rate:  0.28
epoch: 76909, loss: 0.6377594470977783, rewards: -211, count: 193
success rate:  0.32
epoch: 76919, loss: 0.951576828956604, rewards: -408, count: 372
success rate:  0.3
epoch: 76929, loss: 0.5888202786445618, rewards: -77, count: 77
success rate:  0.32
epoch: 76939, loss: 0.8728760480880737, rewards: -326, count: 281
success rate:  0.35
epoch: 76949, loss: 1.2350401878356934, rewards: -50, count: 50
success rate:  0.36
epoch: 76959, loss: 1.095192790031433, rewards: -177, count: 150
success rate:  0.36
epoch: 76969, loss: 0.26929971575737, rewards: -79, count: 79
success rate:  0.37
epoch: 76979, loss: 1.2394616603851318, rewards: -99, count: 99
success rate:  0.39
epoch: 76989, loss: 1.1223292350769043, rewards: -274, count: 265
success rate:  0.39
epoch: 76999, loss: 1.0219101905822754, rewards: -279, count: 252
success rate:  0.38
epoch: 77009, loss: 1.144565224647522, rewards:

success rate:  0.37
epoch: 77879, loss: 0.6475138664245605, rewards: -329, count: 302
success rate:  0.41
epoch: 77889, loss: 0.8532415628433228, rewards: -59, count: 59
success rate:  0.38
epoch: 77899, loss: 0.9747870564460754, rewards: -243, count: 225
success rate:  0.4
epoch: 77909, loss: 1.4281243085861206, rewards: -77, count: 77
success rate:  0.38
epoch: 77919, loss: 0.9450827240943909, rewards: -144, count: 135
success rate:  0.38
epoch: 77929, loss: 0.7257780432701111, rewards: -303, count: 276
success rate:  0.35
epoch: 77939, loss: 0.4258364140987396, rewards: -147, count: 138
success rate:  0.37
epoch: 77949, loss: 1.1681360006332397, rewards: -77, count: 77
success rate:  0.35
epoch: 77959, loss: 0.7588865756988525, rewards: -450, count: 441
success rate:  0.33
epoch: 77969, loss: 0.9381178617477417, rewards: -60, count: 60
success rate:  0.32
epoch: 77979, loss: 0.760502815246582, rewards: -185, count: 167
success rate:  0.26
epoch: 77989, loss: 0.7021973133087158, rewa

success rate:  0.37
epoch: 78859, loss: 1.1586865186691284, rewards: -168, count: 150
success rate:  0.39
epoch: 78869, loss: 0.7404400706291199, rewards: -147, count: 147
success rate:  0.4
epoch: 78879, loss: 1.1384646892547607, rewards: -95, count: 86
success rate:  0.42
epoch: 78889, loss: 0.6821076273918152, rewards: -206, count: 188
success rate:  0.4
epoch: 78899, loss: 0.4967305362224579, rewards: -191, count: 164
success rate:  0.43
epoch: 78909, loss: 1.105448603630066, rewards: -121, count: 121
success rate:  0.41
epoch: 78919, loss: 0.528706967830658, rewards: -320, count: 284
success rate:  0.4
epoch: 78929, loss: 0.8095289468765259, rewards: -324, count: 297
success rate:  0.44
epoch: 78939, loss: 0.6182146668434143, rewards: -93, count: 93
success rate:  0.43
epoch: 78949, loss: 0.06267260760068893, rewards: -48, count: 48
success rate:  0.43
epoch: 78959, loss: 0.7138842344284058, rewards: -414, count: 378
success rate:  0.41
epoch: 78969, loss: 0.6854246258735657, rewa

success rate:  0.39
epoch: 79829, loss: 0.7731902003288269, rewards: -667, count: 604
success rate:  0.4
epoch: 79839, loss: 0.7449095845222473, rewards: -59, count: 59
success rate:  0.41
epoch: 79849, loss: 0.5994634628295898, rewards: -207, count: 189
success rate:  0.41
epoch: 79859, loss: 0.8593544363975525, rewards: -185, count: 176
success rate:  0.42
epoch: 79869, loss: 0.5169943571090698, rewards: -90, count: 81
success rate:  0.42
epoch: 79879, loss: 0.2995877265930176, rewards: -432, count: 396
success rate:  0.45
epoch: 79889, loss: 0.6502615809440613, rewards: -237, count: 219
success rate:  0.43
epoch: 79899, loss: 1.1297423839569092, rewards: -41, count: 41
success rate:  0.4
epoch: 79909, loss: 0.774606466293335, rewards: -230, count: 185
success rate:  0.38
epoch: 79919, loss: 1.0720162391662598, rewards: -130, count: 121
success rate:  0.35
epoch: 79929, loss: 0.8157274723052979, rewards: -147, count: 138
success rate:  0.32
epoch: 79939, loss: 0.576278805732727, rewa

success rate:  0.33
epoch: 80799, loss: 0.8498528599739075, rewards: -68, count: 68
success rate:  0.35
epoch: 80809, loss: 0.8856226801872253, rewards: -371, count: 335
success rate:  0.32
epoch: 80819, loss: 0.4751558005809784, rewards: -796, count: 706
success rate:  0.28
epoch: 80829, loss: 0.9087003469467163, rewards: -64, count: 64
success rate:  0.29
epoch: 80839, loss: 0.875274658203125, rewards: -91, count: 91
success rate:  0.32
epoch: 80849, loss: 0.436004638671875, rewards: -264, count: 237
success rate:  0.3
epoch: 80859, loss: 0.5056328177452087, rewards: -382, count: 355
success rate:  0.3
epoch: 80869, loss: 0.9054087996482849, rewards: -250, count: 214
success rate:  0.34
epoch: 80879, loss: 0.4078670144081116, rewards: -459, count: 414
success rate:  0.32
epoch: 80889, loss: 0.5663380026817322, rewards: -163, count: 154
success rate:  0.28
epoch: 80899, loss: 0.7792538404464722, rewards: -200, count: 191
success rate:  0.29
epoch: 80909, loss: 0.197462797164917, rewar

success rate:  0.25
epoch: 81779, loss: 1.2665656805038452, rewards: -235, count: 226
success rate:  0.29
epoch: 81789, loss: 0.9127033352851868, rewards: -597, count: 507
success rate:  0.3
epoch: 81799, loss: 0.3663599193096161, rewards: -311, count: 275
success rate:  0.31
epoch: 81809, loss: 0.6914476752281189, rewards: -61, count: 61
success rate:  0.36
epoch: 81819, loss: 1.5842560529708862, rewards: -66, count: 66
success rate:  0.34
epoch: 81829, loss: 1.44783616065979, rewards: -227, count: 209
success rate:  0.39
epoch: 81839, loss: 0.9954606890678406, rewards: -88, count: 79
success rate:  0.43
epoch: 81849, loss: 1.375859260559082, rewards: -301, count: 265
success rate:  0.43
epoch: 81859, loss: 0.9259843826293945, rewards: -478, count: 433
success rate:  0.45
epoch: 81869, loss: 1.5716415643692017, rewards: -160, count: 160
success rate:  0.46
epoch: 81879, loss: 0.9211603999137878, rewards: -107, count: 107
success rate:  0.42
epoch: 81889, loss: 0.8439839482307434, rewa

success rate:  0.38
epoch: 82759, loss: 1.372976303100586, rewards: -120, count: 111
success rate:  0.38
epoch: 82769, loss: 1.5034356117248535, rewards: -82, count: 64
success rate:  0.36
epoch: 82779, loss: 1.3633719682693481, rewards: -68, count: 68
success rate:  0.39
epoch: 82789, loss: 1.0881961584091187, rewards: -99, count: 81
success rate:  0.41
epoch: 82799, loss: 0.8727658987045288, rewards: -130, count: 130
success rate:  0.44
epoch: 82809, loss: 1.2359617948532104, rewards: -92, count: 92
success rate:  0.39
epoch: 82819, loss: 0.6282806992530823, rewards: -274, count: 256
success rate:  0.36
epoch: 82829, loss: 0.7527175545692444, rewards: -216, count: 198
success rate:  0.34
epoch: 82839, loss: 0.6967740654945374, rewards: -138, count: 129
success rate:  0.34
epoch: 82849, loss: 0.33634766936302185, rewards: -76, count: 76
success rate:  0.32
epoch: 82859, loss: 1.3301401138305664, rewards: -125, count: 98
success rate:  0.3
epoch: 82869, loss: 1.1059317588806152, reward

success rate:  0.39
epoch: 83739, loss: 1.14900803565979, rewards: -413, count: 377
success rate:  0.38
epoch: 83749, loss: 1.102966547012329, rewards: -617, count: 518
success rate:  0.39
epoch: 83759, loss: 0.735079288482666, rewards: -163, count: 154
success rate:  0.37
epoch: 83769, loss: 1.9066858291625977, rewards: -200, count: 182
success rate:  0.34
epoch: 83779, loss: 1.2319949865341187, rewards: -167, count: 149
success rate:  0.36
epoch: 83789, loss: 1.1041384935379028, rewards: -233, count: 215
success rate:  0.36
epoch: 83799, loss: 1.267455816268921, rewards: -113, count: 104
success rate:  0.36
epoch: 83809, loss: 1.186460018157959, rewards: -74, count: 74
success rate:  0.39
epoch: 83819, loss: 1.3304768800735474, rewards: -167, count: 167
success rate:  0.4
epoch: 83829, loss: 0.8263307213783264, rewards: -81, count: 72
success rate:  0.4
epoch: 83839, loss: 1.1763267517089844, rewards: -200, count: 191
success rate:  0.41
epoch: 83849, loss: 1.2882649898529053, reward

success rate:  0.22
epoch: 84709, loss: 0.7661701440811157, rewards: -353, count: 326
success rate:  0.24
epoch: 84719, loss: 0.6857882738113403, rewards: -350, count: 323
success rate:  0.28
epoch: 84729, loss: 1.0503880977630615, rewards: -272, count: 254
success rate:  0.29
epoch: 84739, loss: 0.886831521987915, rewards: -569, count: 452
success rate:  0.29
epoch: 84749, loss: 1.0628987550735474, rewards: -94, count: 85
success rate:  0.31
epoch: 84759, loss: 0.6299302577972412, rewards: -523, count: 478
success rate:  0.29
epoch: 84769, loss: 0.8370616436004639, rewards: -399, count: 372
success rate:  0.29
epoch: 84779, loss: 0.7323014140129089, rewards: -127, count: 127
success rate:  0.31
epoch: 84789, loss: 0.5353876352310181, rewards: -417, count: 381
success rate:  0.32
epoch: 84799, loss: 0.5032435655593872, rewards: -68, count: 68
success rate:  0.33
epoch: 84809, loss: 1.1871272325515747, rewards: -186, count: 168
success rate:  0.31
epoch: 84819, loss: 0.7102453112602234,

success rate:  0.31
epoch: 85679, loss: 1.0649348497390747, rewards: -149, count: 140
success rate:  0.3
epoch: 85689, loss: 0.6891319155693054, rewards: -155, count: 146
success rate:  0.29
epoch: 85699, loss: 1.4799039363861084, rewards: -115, count: 106
success rate:  0.26
epoch: 85709, loss: 0.9069003462791443, rewards: -377, count: 350
success rate:  0.27
epoch: 85719, loss: 0.761482298374176, rewards: -388, count: 352
success rate:  0.32
epoch: 85729, loss: 1.7227747440338135, rewards: -166, count: 148
success rate:  0.32
epoch: 85739, loss: 0.10379644483327866, rewards: -80, count: 80
success rate:  0.33
epoch: 85749, loss: 1.427698016166687, rewards: -96, count: 78
success rate:  0.32
epoch: 85759, loss: 0.9396582841873169, rewards: -257, count: 230
success rate:  0.36
epoch: 85769, loss: 0.9441515803337097, rewards: -40, count: 40
success rate:  0.35
epoch: 85779, loss: 0.9439371824264526, rewards: -660, count: 615
success rate:  0.36
epoch: 85789, loss: 1.4196254014968872, re

success rate:  0.35
epoch: 86659, loss: 1.3788001537322998, rewards: -98, count: 89
success rate:  0.35
epoch: 86669, loss: 1.7006680965423584, rewards: -61, count: 61
success rate:  0.32
epoch: 86679, loss: 1.3259010314941406, rewards: -325, count: 316
success rate:  0.3
epoch: 86689, loss: 1.1330204010009766, rewards: -107, count: 107
success rate:  0.31
epoch: 86699, loss: 0.7156070470809937, rewards: -69, count: 69
success rate:  0.32
epoch: 86709, loss: 0.7257282137870789, rewards: -803, count: 704
success rate:  0.33
epoch: 86719, loss: 1.319713830947876, rewards: -405, count: 369
success rate:  0.34
epoch: 86729, loss: 1.0821183919906616, rewards: -110, count: 110
success rate:  0.35
epoch: 86739, loss: 1.1275099515914917, rewards: -615, count: 579
success rate:  0.38
epoch: 86749, loss: 0.8442850708961487, rewards: -53, count: 53
success rate:  0.4
epoch: 86759, loss: 0.7646921277046204, rewards: -590, count: 545
success rate:  0.4
epoch: 86769, loss: 1.340248942375183, rewards

success rate:  0.36
epoch: 87629, loss: 0.6248052716255188, rewards: -681, count: 564
success rate:  0.4
epoch: 87639, loss: 0.752484142780304, rewards: -61, count: 61
success rate:  0.39
epoch: 87649, loss: 1.2793896198272705, rewards: -259, count: 241
success rate:  0.36
epoch: 87659, loss: 1.470067024230957, rewards: -143, count: 134
success rate:  0.33
epoch: 87669, loss: 0.7864235639572144, rewards: -827, count: 782
success rate:  0.3
epoch: 87679, loss: 0.900142252445221, rewards: -86, count: 86
success rate:  0.32
epoch: 87689, loss: 1.3526698350906372, rewards: -172, count: 154
success rate:  0.32
epoch: 87699, loss: 0.659409761428833, rewards: -130, count: 130
success rate:  0.36
epoch: 87709, loss: 0.9863985776901245, rewards: -161, count: 152
success rate:  0.36
epoch: 87719, loss: 1.8925249576568604, rewards: -68, count: 68
success rate:  0.39
epoch: 87729, loss: 1.291269063949585, rewards: -74, count: 74
success rate:  0.36
epoch: 87739, loss: 1.0322294235229492, rewards: 

success rate:  0.28
epoch: 88599, loss: 0.7962824702262878, rewards: -683, count: 629
success rate:  0.3
epoch: 88609, loss: 1.3207738399505615, rewards: -102, count: 102
success rate:  0.32
epoch: 88619, loss: 1.4826693534851074, rewards: -88, count: 88
success rate:  0.35
epoch: 88629, loss: 1.628409504890442, rewards: -57, count: 57
success rate:  0.36
epoch: 88639, loss: 1.2732465267181396, rewards: -150, count: 132
success rate:  0.41
epoch: 88649, loss: 1.4193412065505981, rewards: -154, count: 154
success rate:  0.41
epoch: 88659, loss: 1.0770435333251953, rewards: -435, count: 399
success rate:  0.41
epoch: 88669, loss: 1.142141342163086, rewards: -229, count: 220
success rate:  0.4
epoch: 88679, loss: 1.6078262329101562, rewards: -68, count: 68
success rate:  0.41
epoch: 88689, loss: 0.950604259967804, rewards: -165, count: 147
success rate:  0.41
epoch: 88699, loss: 1.0094009637832642, rewards: -89, count: 89
success rate:  0.42
epoch: 88709, loss: 1.7064228057861328, rewards

success rate:  0.31
epoch: 89579, loss: 1.689642310142517, rewards: -110, count: 101
success rate:  0.29
epoch: 89589, loss: 1.3975331783294678, rewards: -107, count: 98
success rate:  0.27
epoch: 89599, loss: 0.5005517601966858, rewards: -229, count: 211
success rate:  0.28
epoch: 89609, loss: 1.096756100654602, rewards: -337, count: 319
success rate:  0.3
epoch: 89619, loss: 0.8324742913246155, rewards: -285, count: 267
success rate:  0.31
epoch: 89629, loss: 0.9054287075996399, rewards: -545, count: 518
success rate:  0.29
epoch: 89639, loss: 0.7243106365203857, rewards: -527, count: 455
success rate:  0.3
epoch: 89649, loss: 1.7218265533447266, rewards: -52, count: 52
success rate:  0.33
epoch: 89659, loss: 0.10299772024154663, rewards: -156, count: 138
success rate:  0.32
epoch: 89669, loss: 0.5304826498031616, rewards: -211, count: 184
success rate:  0.34
epoch: 89679, loss: 0.5637237429618835, rewards: -140, count: 140
success rate:  0.33
epoch: 89689, loss: 1.0891046524047852, 

success rate:  0.28
epoch: 90549, loss: 1.018437385559082, rewards: -82, count: 64
success rate:  0.24
epoch: 90559, loss: 0.5882935523986816, rewards: -142, count: 106
success rate:  0.25
epoch: 90569, loss: 1.2234687805175781, rewards: -125, count: 125
success rate:  0.28
epoch: 90579, loss: 0.5206944346427917, rewards: -320, count: 293
success rate:  0.3
epoch: 90589, loss: 0.5001462697982788, rewards: -226, count: 208
success rate:  0.29
epoch: 90599, loss: 0.6378130912780762, rewards: -256, count: 238
success rate:  0.29
epoch: 90609, loss: 1.0988017320632935, rewards: -116, count: 116
success rate:  0.29
epoch: 90619, loss: 0.33030641078948975, rewards: -41, count: 41
success rate:  0.3
epoch: 90629, loss: 1.1361150741577148, rewards: -54, count: 54
success rate:  0.32
epoch: 90639, loss: 1.532596230506897, rewards: -120, count: 111
success rate:  0.31
epoch: 90649, loss: 1.930349588394165, rewards: -60, count: 51
success rate:  0.33
epoch: 90659, loss: 1.5954201221466064, reward

success rate:  0.36
epoch: 91519, loss: 1.047004222869873, rewards: -537, count: 501
success rate:  0.4
epoch: 91529, loss: 1.3464804887771606, rewards: -128, count: 128
success rate:  0.42
epoch: 91539, loss: 0.568649411201477, rewards: -134, count: 125
success rate:  0.42
epoch: 91549, loss: 0.5226120352745056, rewards: -260, count: 224
success rate:  0.39
epoch: 91559, loss: 1.8128721714019775, rewards: -147, count: 138
success rate:  0.36
epoch: 91569, loss: 1.1589840650558472, rewards: -285, count: 249
success rate:  0.36
epoch: 91579, loss: 1.3641841411590576, rewards: -151, count: 151
success rate:  0.34
epoch: 91589, loss: 1.1706514358520508, rewards: -51, count: 51
success rate:  0.36
epoch: 91599, loss: 1.719642162322998, rewards: -98, count: 98
success rate:  0.36
epoch: 91609, loss: 0.8373007774353027, rewards: -971, count: 899
success rate:  0.33
epoch: 91619, loss: 0.5250237584114075, rewards: -156, count: 147
success rate:  0.31
epoch: 91629, loss: 1.3442541360855103, re

success rate:  0.35
epoch: 92489, loss: 0.2573951482772827, rewards: -236, count: 218
success rate:  0.34
epoch: 92499, loss: 1.653799057006836, rewards: -240, count: 222
success rate:  0.34
epoch: 92509, loss: 0.9684603214263916, rewards: -114, count: 105
success rate:  0.33
epoch: 92519, loss: 0.6968811750411987, rewards: -302, count: 266
success rate:  0.3
epoch: 92529, loss: 1.1192145347595215, rewards: -59, count: 59
success rate:  0.26
epoch: 92539, loss: 1.6696075201034546, rewards: -111, count: 102
success rate:  0.29
epoch: 92549, loss: 1.6831574440002441, rewards: -205, count: 205
success rate:  0.25
epoch: 92559, loss: 0.6711128950119019, rewards: -416, count: 371
success rate:  0.24
epoch: 92569, loss: 1.6865500211715698, rewards: -147, count: 138
success rate:  0.22
epoch: 92579, loss: 1.0837384462356567, rewards: -87, count: 87
success rate:  0.21
epoch: 92589, loss: 2.1458959579467773, rewards: -97, count: 88
success rate:  0.25
epoch: 92599, loss: 1.718403935432434, rew

success rate:  0.39
epoch: 93469, loss: 1.242058277130127, rewards: -238, count: 229
success rate:  0.39
epoch: 93479, loss: 0.7669895887374878, rewards: -404, count: 368
success rate:  0.38
epoch: 93489, loss: 1.0504909753799438, rewards: -178, count: 160
success rate:  0.35
epoch: 93499, loss: 1.2718226909637451, rewards: -229, count: 211
success rate:  0.35
epoch: 93509, loss: 0.8203348517417908, rewards: -916, count: 817
success rate:  0.31
epoch: 93519, loss: 1.2739125490188599, rewards: -156, count: 147
success rate:  0.31
epoch: 93529, loss: 1.0836738348007202, rewards: -292, count: 283
success rate:  0.33
epoch: 93539, loss: 1.124936819076538, rewards: -418, count: 400
success rate:  0.36
epoch: 93549, loss: 0.5497826337814331, rewards: -312, count: 276
success rate:  0.38
epoch: 93559, loss: 1.5029066801071167, rewards: -46, count: 46
success rate:  0.37
epoch: 93569, loss: 0.9527317881584167, rewards: -190, count: 181
success rate:  0.39
epoch: 93579, loss: 0.9976773858070374

success rate:  0.43
epoch: 94439, loss: 1.128164291381836, rewards: -105, count: 105
success rate:  0.41
epoch: 94449, loss: 1.7337443828582764, rewards: -73, count: 64
success rate:  0.4
epoch: 94459, loss: 1.1837557554244995, rewards: -125, count: 125
success rate:  0.4
epoch: 94469, loss: 0.9435094594955444, rewards: -46, count: 46
success rate:  0.42
epoch: 94479, loss: 1.4293369054794312, rewards: -229, count: 229
success rate:  0.4
epoch: 94489, loss: 1.38020658493042, rewards: -61, count: 61
success rate:  0.4
epoch: 94499, loss: 1.5763975381851196, rewards: -195, count: 168
success rate:  0.39
epoch: 94509, loss: 1.3654152154922485, rewards: -47, count: 38
success rate:  0.42
epoch: 94519, loss: 1.4103745222091675, rewards: -104, count: 86
success rate:  0.39
epoch: 94529, loss: 1.0670197010040283, rewards: -730, count: 631
success rate:  0.36
epoch: 94539, loss: 1.4800089597702026, rewards: -67, count: 58
success rate:  0.36
epoch: 94549, loss: 1.141668677330017, rewards: -345

success rate:  0.33
epoch: 95419, loss: 1.2685492038726807, rewards: -274, count: 247
success rate:  0.36
epoch: 95429, loss: 0.820697546005249, rewards: -202, count: 193
success rate:  0.32
epoch: 95439, loss: 0.6757422685623169, rewards: -189, count: 180
success rate:  0.31
epoch: 95449, loss: 1.829702377319336, rewards: -44, count: 35
success rate:  0.31
epoch: 95459, loss: 0.6788381338119507, rewards: -120, count: 111
success rate:  0.32
epoch: 95469, loss: 1.2595525979995728, rewards: -64, count: 64
success rate:  0.34
epoch: 95479, loss: 0.44186899065971375, rewards: -73, count: 73
success rate:  0.35
epoch: 95489, loss: 1.1588237285614014, rewards: -449, count: 404
success rate:  0.34
epoch: 95499, loss: 0.9364486336708069, rewards: -290, count: 263
success rate:  0.32
epoch: 95509, loss: 1.107477068901062, rewards: -195, count: 177
success rate:  0.32
epoch: 95519, loss: 1.2837178707122803, rewards: -437, count: 419
success rate:  0.29
epoch: 95529, loss: 1.5119919776916504, re

success rate:  0.35
epoch: 96389, loss: 0.3285232484340668, rewards: -50, count: 50
success rate:  0.35
epoch: 96399, loss: 1.2683299779891968, rewards: -125, count: 116
success rate:  0.34
epoch: 96409, loss: 1.63995361328125, rewards: -114, count: 114
success rate:  0.34
epoch: 96419, loss: 1.2498795986175537, rewards: -249, count: 222
success rate:  0.32
epoch: 96429, loss: 0.8647265434265137, rewards: -796, count: 742
success rate:  0.35
epoch: 96439, loss: 0.41523340344429016, rewards: -233, count: 206
success rate:  0.35
epoch: 96449, loss: 0.9772132039070129, rewards: -178, count: 160
success rate:  0.35
epoch: 96459, loss: 0.9043565392494202, rewards: -286, count: 241
success rate:  0.32
epoch: 96469, loss: 1.3824577331542969, rewards: -91, count: 91
success rate:  0.37
epoch: 96479, loss: 0.4388831555843353, rewards: -70, count: 70
success rate:  0.35
epoch: 96489, loss: 1.4351518154144287, rewards: -197, count: 179
success rate:  0.35
epoch: 96499, loss: 1.4576243162155151, r

success rate:  0.38
epoch: 97359, loss: 1.557376742362976, rewards: -92, count: 92
success rate:  0.43
epoch: 97369, loss: 1.7501776218414307, rewards: -67, count: 67
success rate:  0.45
epoch: 97379, loss: 1.2711635828018188, rewards: -161, count: 152
success rate:  0.43
epoch: 97389, loss: 1.4104701280593872, rewards: -169, count: 169
success rate:  0.43
epoch: 97399, loss: 0.9656391143798828, rewards: -103, count: 85
success rate:  0.4
epoch: 97409, loss: 0.8514888882637024, rewards: -367, count: 349
success rate:  0.4
epoch: 97419, loss: 0.8177593350410461, rewards: -41, count: 41
success rate:  0.42
epoch: 97429, loss: 1.0918176174163818, rewards: -581, count: 509
success rate:  0.42
epoch: 97439, loss: 1.3033555746078491, rewards: -135, count: 108
success rate:  0.43
epoch: 97449, loss: 0.9998935461044312, rewards: -399, count: 318
success rate:  0.42
epoch: 97459, loss: 0.8104860782623291, rewards: -224, count: 215
success rate:  0.39
epoch: 97469, loss: 1.3489352464675903, rewa

success rate:  0.31
epoch: 98329, loss: 1.7112548351287842, rewards: -132, count: 132
success rate:  0.32
epoch: 98339, loss: 0.9387406706809998, rewards: -86, count: 86
success rate:  0.3
epoch: 98349, loss: 1.2677290439605713, rewards: -60, count: 60
success rate:  0.29
epoch: 98359, loss: 1.404832363128662, rewards: -112, count: 103
success rate:  0.31
epoch: 98369, loss: 0.9009910225868225, rewards: -561, count: 498
success rate:  0.32
epoch: 98379, loss: 0.7651785612106323, rewards: -265, count: 229
success rate:  0.31
epoch: 98389, loss: 1.5396002531051636, rewards: -133, count: 124
success rate:  0.32
epoch: 98399, loss: 1.1642255783081055, rewards: -360, count: 351
success rate:  0.36
epoch: 98409, loss: 1.465960144996643, rewards: -97, count: 97
success rate:  0.38
epoch: 98419, loss: 1.2165261507034302, rewards: -200, count: 191
success rate:  0.38
epoch: 98429, loss: 0.941392183303833, rewards: -349, count: 313
success rate:  0.36
epoch: 98439, loss: 1.8802263736724854, rewa

success rate:  0.34
epoch: 99299, loss: 1.1772277355194092, rewards: -194, count: 185
success rate:  0.32
epoch: 99309, loss: 1.194532036781311, rewards: -151, count: 133
success rate:  0.33
epoch: 99319, loss: 0.8852246403694153, rewards: -547, count: 511
success rate:  0.3
epoch: 99329, loss: 0.6802870631217957, rewards: -248, count: 194
success rate:  0.35
epoch: 99339, loss: 1.0327162742614746, rewards: -594, count: 540
success rate:  0.35
epoch: 99349, loss: 1.1143022775650024, rewards: -163, count: 154
success rate:  0.35
epoch: 99359, loss: 0.5366661548614502, rewards: -67, count: 67
success rate:  0.35
epoch: 99369, loss: 0.8094348311424255, rewards: -174, count: 156
success rate:  0.36
epoch: 99379, loss: 0.961293637752533, rewards: -245, count: 227
success rate:  0.37
epoch: 99389, loss: 1.5155174732208252, rewards: -149, count: 140
success rate:  0.36
epoch: 99399, loss: 1.3287837505340576, rewards: -114, count: 105
success rate:  0.37
epoch: 99409, loss: 0.17935585975646973

success rate:  0.33
epoch: 100269, loss: 0.9306378364562988, rewards: -490, count: 436
success rate:  0.31
epoch: 100279, loss: 1.397050142288208, rewards: -84, count: 66
success rate:  0.3
epoch: 100289, loss: 0.7973291277885437, rewards: -348, count: 330
success rate:  0.34
epoch: 100299, loss: 1.1481564044952393, rewards: -122, count: 122
success rate:  0.32
epoch: 100309, loss: 0.5842370390892029, rewards: -1153, count: 1000
success rate:  0.29
epoch: 100319, loss: 0.7902718782424927, rewards: -419, count: 383
success rate:  0.28
epoch: 100329, loss: 0.557136595249176, rewards: -155, count: 146
success rate:  0.3
epoch: 100339, loss: 1.0571949481964111, rewards: -118, count: 100
success rate:  0.29
epoch: 100349, loss: 0.42982301115989685, rewards: -29, count: 29
success rate:  0.32
epoch: 100359, loss: 1.387965440750122, rewards: -572, count: 446
success rate:  0.31
epoch: 100369, loss: 0.8517230749130249, rewards: -133, count: 124
success rate:  0.33
epoch: 100379, loss: 1.165257

success rate:  0.35
epoch: 101219, loss: 0.8323126435279846, rewards: -47, count: 47
success rate:  0.36
epoch: 101229, loss: 1.0819305181503296, rewards: -75, count: 75
success rate:  0.37
epoch: 101239, loss: 1.5351841449737549, rewards: -67, count: 58
success rate:  0.34
epoch: 101249, loss: 1.0180050134658813, rewards: -282, count: 255
success rate:  0.32
epoch: 101259, loss: 0.8933843374252319, rewards: -121, count: 121
success rate:  0.3
epoch: 101269, loss: 0.6071575880050659, rewards: -125, count: 116
success rate:  0.34
epoch: 101279, loss: 0.9689405560493469, rewards: -51, count: 51
success rate:  0.32
epoch: 101289, loss: 1.3161582946777344, rewards: -79, count: 79
success rate:  0.3
epoch: 101299, loss: 1.421729326248169, rewards: -279, count: 252
success rate:  0.33
epoch: 101309, loss: 0.9011761546134949, rewards: -118, count: 109
success rate:  0.32
epoch: 101319, loss: 1.8743056058883667, rewards: -102, count: 93
success rate:  0.31
epoch: 101329, loss: 1.21293509006500

success rate:  0.39
epoch: 102189, loss: 1.362727403640747, rewards: -144, count: 126
success rate:  0.35
epoch: 102199, loss: 1.01725435256958, rewards: -275, count: 230
success rate:  0.36
epoch: 102209, loss: 0.9252305626869202, rewards: -602, count: 539
success rate:  0.37
epoch: 102219, loss: 1.2244606018066406, rewards: -72, count: 72
success rate:  0.4
epoch: 102229, loss: 1.0983812808990479, rewards: -271, count: 262
success rate:  0.36
epoch: 102239, loss: 1.428711175918579, rewards: -43, count: 43
success rate:  0.36
epoch: 102249, loss: 1.1700100898742676, rewards: -73, count: 73
success rate:  0.34
epoch: 102259, loss: 1.122112512588501, rewards: -108, count: 108
success rate:  0.32
epoch: 102269, loss: 0.9694520831108093, rewards: -134, count: 125
success rate:  0.3
epoch: 102279, loss: 2.2098217010498047, rewards: -74, count: 47
success rate:  0.31
epoch: 102289, loss: 1.4776058197021484, rewards: -80, count: 71
success rate:  0.32
epoch: 102299, loss: 1.3428192138671875,

success rate:  0.36
epoch: 103149, loss: 1.0501266717910767, rewards: -275, count: 257
success rate:  0.34
epoch: 103159, loss: 1.3864116668701172, rewards: -189, count: 180
success rate:  0.39
epoch: 103169, loss: 0.6628660559654236, rewards: -133, count: 124
success rate:  0.38
epoch: 103179, loss: 1.4234639406204224, rewards: -187, count: 169
success rate:  0.38
epoch: 103189, loss: 1.2467153072357178, rewards: -389, count: 371
success rate:  0.37
epoch: 103199, loss: 1.225175380706787, rewards: -124, count: 124
success rate:  0.39
epoch: 103209, loss: 0.7247787117958069, rewards: -274, count: 256
success rate:  0.39
epoch: 103219, loss: 0.6041397452354431, rewards: -265, count: 229
success rate:  0.39
epoch: 103229, loss: 0.8107072710990906, rewards: -95, count: 86
success rate:  0.38
epoch: 103239, loss: 1.0305105447769165, rewards: -70, count: 70
success rate:  0.37
epoch: 103249, loss: 0.6723641157150269, rewards: -154, count: 118
success rate:  0.37
epoch: 103259, loss: 1.25819

success rate:  0.33
epoch: 104109, loss: 1.3277667760849, rewards: -216, count: 198
success rate:  0.31
epoch: 104119, loss: 1.5884246826171875, rewards: -180, count: 162
success rate:  0.28
epoch: 104129, loss: 0.4593704342842102, rewards: -184, count: 166
success rate:  0.28
epoch: 104139, loss: 1.172129511833191, rewards: -140, count: 122
success rate:  0.23
epoch: 104149, loss: 1.6352558135986328, rewards: -226, count: 190
success rate:  0.23
epoch: 104159, loss: 1.2363090515136719, rewards: -47, count: 47
success rate:  0.26
epoch: 104169, loss: 1.1371623277664185, rewards: -179, count: 170
success rate:  0.3
epoch: 104179, loss: 0.45286139845848083, rewards: -172, count: 154
success rate:  0.34
epoch: 104189, loss: 0.5635186433792114, rewards: -51, count: 51
success rate:  0.33
epoch: 104199, loss: 1.2974601984024048, rewards: -117, count: 117
success rate:  0.37
epoch: 104209, loss: 1.1182676553726196, rewards: -92, count: 92
success rate:  0.4
epoch: 104219, loss: 0.59862083196

success rate:  0.31
epoch: 105069, loss: 2.178386688232422, rewards: -79, count: 61
success rate:  0.33
epoch: 105079, loss: 1.8185421228408813, rewards: -171, count: 171
success rate:  0.3
epoch: 105089, loss: 1.2477799654006958, rewards: -327, count: 291
success rate:  0.32
epoch: 105099, loss: 2.013200521469116, rewards: -48, count: 48
success rate:  0.32
epoch: 105109, loss: 1.1610321998596191, rewards: -141, count: 114
success rate:  0.33
epoch: 105119, loss: 1.0304780006408691, rewards: -98, count: 98
success rate:  0.31
epoch: 105129, loss: 0.615429699420929, rewards: -267, count: 258
success rate:  0.29
epoch: 105139, loss: 1.325935959815979, rewards: -194, count: 167
success rate:  0.27
epoch: 105149, loss: 1.309799313545227, rewards: -102, count: 102
success rate:  0.22
epoch: 105159, loss: 1.85556161403656, rewards: -98, count: 62
success rate:  0.26
epoch: 105169, loss: 0.8672998547554016, rewards: -162, count: 135
success rate:  0.25
epoch: 105179, loss: 0.8329461216926575

success rate:  0.32
epoch: 106039, loss: 1.0588829517364502, rewards: -140, count: 122
success rate:  0.32
epoch: 106049, loss: 0.8178896307945251, rewards: -465, count: 402
success rate:  0.33
epoch: 106059, loss: 1.4104071855545044, rewards: -88, count: 88
success rate:  0.32
epoch: 106069, loss: 1.7472590208053589, rewards: -96, count: 96
success rate:  0.32
epoch: 106079, loss: 1.0532493591308594, rewards: -647, count: 548
success rate:  0.3
epoch: 106089, loss: 1.9481149911880493, rewards: -72, count: 72
success rate:  0.29
epoch: 106099, loss: 1.0938668251037598, rewards: -272, count: 254
success rate:  0.31
epoch: 106109, loss: 0.9576535224914551, rewards: -236, count: 200
success rate:  0.35
epoch: 106119, loss: 1.5647481679916382, rewards: -327, count: 327
success rate:  0.31
epoch: 106129, loss: 1.1534984111785889, rewards: -292, count: 283
success rate:  0.29
epoch: 106139, loss: 0.6668126583099365, rewards: -295, count: 277
success rate:  0.3
epoch: 106149, loss: 0.32291817

success rate:  0.36
epoch: 106999, loss: 1.0819553136825562, rewards: -217, count: 199
success rate:  0.31
epoch: 107009, loss: 0.46739184856414795, rewards: -82, count: 73
success rate:  0.29
epoch: 107019, loss: 1.8607605695724487, rewards: -71, count: 71
success rate:  0.25
epoch: 107029, loss: 0.5191699266433716, rewards: -240, count: 204
success rate:  0.27
epoch: 107039, loss: 0.8415162563323975, rewards: -163, count: 154
success rate:  0.25
epoch: 107049, loss: 0.9655827879905701, rewards: -148, count: 139
success rate:  0.21
epoch: 107059, loss: 0.8603841066360474, rewards: -76, count: 76
success rate:  0.2
epoch: 107069, loss: 0.6994501948356628, rewards: -230, count: 185
success rate:  0.21
epoch: 107079, loss: 1.1313432455062866, rewards: -367, count: 322
success rate:  0.22
epoch: 107089, loss: 1.3948304653167725, rewards: -103, count: 94
success rate:  0.21
epoch: 107099, loss: 0.8997325301170349, rewards: -147, count: 129
success rate:  0.25
epoch: 107109, loss: 1.8417298

success rate:  0.37
epoch: 107969, loss: 0.6227681040763855, rewards: -152, count: 134
success rate:  0.36
epoch: 107979, loss: 0.8033899068832397, rewards: -412, count: 358
success rate:  0.35
epoch: 107989, loss: 0.6362469792366028, rewards: -325, count: 289
success rate:  0.31
epoch: 107999, loss: 0.9685116410255432, rewards: -107, count: 98
success rate:  0.32
epoch: 108009, loss: 1.49716055393219, rewards: -119, count: 119
success rate:  0.32
epoch: 108019, loss: 1.0662986040115356, rewards: -339, count: 312
success rate:  0.31
epoch: 108029, loss: 0.9581794142723083, rewards: -524, count: 461
success rate:  0.31
epoch: 108039, loss: 1.7599713802337646, rewards: -116, count: 107
success rate:  0.29
epoch: 108049, loss: 0.9116427302360535, rewards: -747, count: 666
success rate:  0.29
epoch: 108059, loss: 0.8149430155754089, rewards: -131, count: 113
success rate:  0.29
epoch: 108069, loss: 0.606132447719574, rewards: -479, count: 443
success rate:  0.3
epoch: 108079, loss: 0.54346

success rate:  0.3
epoch: 108939, loss: 1.1135791540145874, rewards: -576, count: 531
success rate:  0.28
epoch: 108949, loss: 0.7412464618682861, rewards: -407, count: 380
success rate:  0.26
epoch: 108959, loss: 1.7789069414138794, rewards: -127, count: 127
success rate:  0.23
epoch: 108969, loss: 0.20257289707660675, rewards: -192, count: 174
success rate:  0.26
epoch: 108979, loss: 1.2009391784667969, rewards: -307, count: 271
success rate:  0.27
epoch: 108989, loss: 1.8001315593719482, rewards: -73, count: 73
success rate:  0.29
epoch: 108999, loss: 1.1089683771133423, rewards: -328, count: 310
success rate:  0.31
epoch: 109009, loss: 1.2158000469207764, rewards: -492, count: 456
success rate:  0.34
epoch: 109019, loss: 0.596522331237793, rewards: -33, count: 33
success rate:  0.35
epoch: 109029, loss: 1.0491564273834229, rewards: -487, count: 433
success rate:  0.34
epoch: 109039, loss: 1.62839674949646, rewards: -134, count: 116
success rate:  0.34
epoch: 109049, loss: 1.1760578

success rate:  0.35
epoch: 109899, loss: 0.7975559234619141, rewards: -432, count: 387
success rate:  0.33
epoch: 109909, loss: 0.7742943167686462, rewards: -487, count: 415
success rate:  0.35
epoch: 109919, loss: 1.0784187316894531, rewards: -105, count: 105
success rate:  0.35
epoch: 109929, loss: 0.8218579292297363, rewards: -1085, count: 995
success rate:  0.39
epoch: 109939, loss: 1.4918349981307983, rewards: -107, count: 107
success rate:  0.34
epoch: 109949, loss: 0.7429212331771851, rewards: -216, count: 207
success rate:  0.3
epoch: 109959, loss: 0.617125391960144, rewards: -300, count: 237
success rate:  0.31
epoch: 109969, loss: 1.055846929550171, rewards: -149, count: 122
success rate:  0.32
epoch: 109979, loss: 2.127509593963623, rewards: -64, count: 55
success rate:  0.32
epoch: 109989, loss: 1.4619412422180176, rewards: -213, count: 204
success rate:  0.37
epoch: 109999, loss: 1.2429234981536865, rewards: -125, count: 125
success rate:  0.41
epoch: 110009, loss: 1.29127

success rate:  0.33
epoch: 110859, loss: 0.30522891879081726, rewards: -134, count: 125
success rate:  0.34
epoch: 110869, loss: 0.8841708302497864, rewards: -171, count: 162
success rate:  0.32
epoch: 110879, loss: 1.3718161582946777, rewards: -231, count: 204
success rate:  0.32
epoch: 110889, loss: 1.7166218757629395, rewards: -289, count: 217
success rate:  0.31
epoch: 110899, loss: 1.0477474927902222, rewards: -199, count: 181
success rate:  0.28
epoch: 110909, loss: 0.9567638039588928, rewards: -759, count: 705
success rate:  0.29
epoch: 110919, loss: 1.4861441850662231, rewards: -292, count: 274
success rate:  0.28
epoch: 110929, loss: 1.2381244897842407, rewards: -122, count: 113
success rate:  0.32
epoch: 110939, loss: 1.1036708354949951, rewards: -737, count: 638
success rate:  0.31
epoch: 110949, loss: 1.7050139904022217, rewards: -188, count: 188
success rate:  0.33
epoch: 110959, loss: 0.7189310789108276, rewards: -117, count: 108
success rate:  0.31
epoch: 110969, loss: 1

success rate:  0.31
epoch: 111819, loss: 1.1948940753936768, rewards: -106, count: 97
success rate:  0.34
epoch: 111829, loss: 0.5133535265922546, rewards: -155, count: 146
success rate:  0.33
epoch: 111839, loss: 1.2289284467697144, rewards: -72, count: 63
success rate:  0.33
epoch: 111849, loss: 1.5655990839004517, rewards: -189, count: 189
success rate:  0.29
epoch: 111859, loss: 1.4356569051742554, rewards: -276, count: 258
success rate:  0.32
epoch: 111869, loss: 2.2325284481048584, rewards: -50, count: 50
success rate:  0.3
epoch: 111879, loss: 1.2916275262832642, rewards: -171, count: 162
success rate:  0.29
epoch: 111889, loss: 1.2798998355865479, rewards: -254, count: 245
success rate:  0.3
epoch: 111899, loss: 1.2309925556182861, rewards: -84, count: 75
success rate:  0.28
epoch: 111909, loss: 2.03753399848938, rewards: -112, count: 103
success rate:  0.32
epoch: 111919, loss: 1.011840581893921, rewards: -195, count: 168
success rate:  0.3
epoch: 111929, loss: 2.7468655109405

success rate:  0.37
epoch: 112789, loss: 0.7643325328826904, rewards: -277, count: 268
success rate:  0.4
epoch: 112799, loss: 1.2079896926879883, rewards: -77, count: 77
success rate:  0.4
epoch: 112809, loss: 0.8327088952064514, rewards: -209, count: 164
success rate:  0.41
epoch: 112819, loss: 1.0583124160766602, rewards: -123, count: 123
success rate:  0.41
epoch: 112829, loss: 1.4214791059494019, rewards: -425, count: 389
success rate:  0.41
epoch: 112839, loss: 0.8005885481834412, rewards: -771, count: 699
success rate:  0.4
epoch: 112849, loss: 0.745827317237854, rewards: -228, count: 210
success rate:  0.43
epoch: 112859, loss: 1.3017466068267822, rewards: -247, count: 229
success rate:  0.45
epoch: 112869, loss: 1.017587423324585, rewards: -344, count: 308
success rate:  0.41
epoch: 112879, loss: 1.1408345699310303, rewards: -67, count: 67
success rate:  0.37
epoch: 112889, loss: 1.4907146692276, rewards: -402, count: 330
success rate:  0.31
epoch: 112899, loss: 1.294889211654

success rate:  0.34
epoch: 113749, loss: 0.8828110098838806, rewards: -990, count: 891
success rate:  0.33
epoch: 113759, loss: 1.822285532951355, rewards: -149, count: 140
success rate:  0.34
epoch: 113769, loss: 0.6875626444816589, rewards: -278, count: 233
success rate:  0.33
epoch: 113779, loss: 1.7649507522583008, rewards: -169, count: 160
success rate:  0.35
epoch: 113789, loss: 1.1357042789459229, rewards: -261, count: 234
success rate:  0.35
epoch: 113799, loss: 0.7977309226989746, rewards: -453, count: 426
success rate:  0.36
epoch: 113809, loss: 1.6974973678588867, rewards: -324, count: 279
success rate:  0.36
epoch: 113819, loss: 2.065729856491089, rewards: -65, count: 56
success rate:  0.33
epoch: 113829, loss: 1.9253793954849243, rewards: -154, count: 136
success rate:  0.31
epoch: 113839, loss: 2.2042250633239746, rewards: -57, count: 57
success rate:  0.31
epoch: 113849, loss: 1.6551706790924072, rewards: -127, count: 127
success rate:  0.31
epoch: 113859, loss: 0.976743

success rate:  0.39
epoch: 114719, loss: 1.7881020307540894, rewards: -124, count: 124
success rate:  0.37
epoch: 114729, loss: 1.0834989547729492, rewards: -53, count: 53
success rate:  0.36
epoch: 114739, loss: 1.330033540725708, rewards: -246, count: 237
success rate:  0.35
epoch: 114749, loss: 0.9249451756477356, rewards: -139, count: 130
success rate:  0.35
epoch: 114759, loss: 1.2978978157043457, rewards: -58, count: 58
success rate:  0.34
epoch: 114769, loss: 1.548537015914917, rewards: -44, count: 44
success rate:  0.32
epoch: 114779, loss: 1.1895089149475098, rewards: -450, count: 405
success rate:  0.31
epoch: 114789, loss: 2.2037978172302246, rewards: -62, count: 62
success rate:  0.28
epoch: 114799, loss: 1.08396577835083, rewards: -332, count: 296
success rate:  0.28
epoch: 114809, loss: 1.6537526845932007, rewards: -181, count: 172
success rate:  0.26
epoch: 114819, loss: 1.785941243171692, rewards: -291, count: 264
success rate:  0.26
epoch: 114829, loss: 0.9669935107231

success rate:  0.4
epoch: 115689, loss: 1.7810395956039429, rewards: -94, count: 85
success rate:  0.41
epoch: 115699, loss: 1.3881577253341675, rewards: -322, count: 286
success rate:  0.39
epoch: 115709, loss: 1.2017347812652588, rewards: -323, count: 305
success rate:  0.39
epoch: 115719, loss: 1.8698601722717285, rewards: -153, count: 144
success rate:  0.38
epoch: 115729, loss: 1.7762939929962158, rewards: -112, count: 112
success rate:  0.36
epoch: 115739, loss: 1.0852926969528198, rewards: -147, count: 138
success rate:  0.37
epoch: 115749, loss: 0.6061798930168152, rewards: -76, count: 76
success rate:  0.38
epoch: 115759, loss: 1.6344728469848633, rewards: -161, count: 143
success rate:  0.35
epoch: 115769, loss: 1.7891463041305542, rewards: -111, count: 102
success rate:  0.34
epoch: 115779, loss: 1.1945953369140625, rewards: -285, count: 267
success rate:  0.31
epoch: 115789, loss: 0.9127563238143921, rewards: -620, count: 512
success rate:  0.3
epoch: 115799, loss: 0.922630

success rate:  0.28
epoch: 116659, loss: 1.2837896347045898, rewards: -437, count: 419
success rate:  0.28
epoch: 116669, loss: 1.2857959270477295, rewards: -247, count: 238
success rate:  0.3
epoch: 116679, loss: 0.7725681662559509, rewards: -98, count: 89
success rate:  0.32
epoch: 116689, loss: 0.7948164343833923, rewards: -499, count: 490
success rate:  0.32
epoch: 116699, loss: 1.4850298166275024, rewards: -97, count: 97
success rate:  0.31
epoch: 116709, loss: 1.9165399074554443, rewards: -89, count: 89
success rate:  0.31
epoch: 116719, loss: 1.7020723819732666, rewards: -233, count: 197
success rate:  0.34
epoch: 116729, loss: 1.5622646808624268, rewards: -67, count: 67
success rate:  0.37
epoch: 116739, loss: 1.488206386566162, rewards: -447, count: 411
success rate:  0.38
epoch: 116749, loss: 1.1510486602783203, rewards: -214, count: 205
success rate:  0.38
epoch: 116759, loss: 1.6570743322372437, rewards: -128, count: 119
success rate:  0.41
epoch: 116769, loss: 1.8384754657

success rate:  0.24
epoch: 117629, loss: 0.7964866161346436, rewards: -471, count: 399
success rate:  0.28
epoch: 117639, loss: 1.5265226364135742, rewards: -111, count: 111
success rate:  0.32
epoch: 117649, loss: 0.651106595993042, rewards: -582, count: 510
success rate:  0.33
epoch: 117659, loss: 1.97079598903656, rewards: -151, count: 124
success rate:  0.36
epoch: 117669, loss: 1.8516675233840942, rewards: -94, count: 94
success rate:  0.35
epoch: 117679, loss: 1.5173768997192383, rewards: -482, count: 464
success rate:  0.35
epoch: 117689, loss: 1.2479610443115234, rewards: -197, count: 188
success rate:  0.4
epoch: 117699, loss: 1.1684463024139404, rewards: -100, count: 100
success rate:  0.43
epoch: 117709, loss: 0.9792636036872864, rewards: -178, count: 169
success rate:  0.44
epoch: 117719, loss: 1.2382161617279053, rewards: -53, count: 53
success rate:  0.46
epoch: 117729, loss: 1.1503281593322754, rewards: -692, count: 575
success rate:  0.42
epoch: 117739, loss: 1.71360266

success rate:  0.42
epoch: 118589, loss: 0.8772040009498596, rewards: -780, count: 690
success rate:  0.44
epoch: 118599, loss: 1.2057441473007202, rewards: -502, count: 457
success rate:  0.42
epoch: 118609, loss: 1.2642077207565308, rewards: -377, count: 359
success rate:  0.41
epoch: 118619, loss: 1.086120843887329, rewards: -166, count: 157
success rate:  0.42
epoch: 118629, loss: 1.1932086944580078, rewards: -75, count: 75
success rate:  0.42
epoch: 118639, loss: 0.6578836441040039, rewards: -520, count: 457
success rate:  0.41
epoch: 118649, loss: 0.8971464037895203, rewards: -418, count: 382
success rate:  0.44
epoch: 118659, loss: 1.0714302062988281, rewards: -76, count: 67
success rate:  0.43
epoch: 118669, loss: 1.3073877096176147, rewards: -107, count: 107
success rate:  0.4
epoch: 118679, loss: 0.9585816860198975, rewards: -37, count: 37
success rate:  0.41
epoch: 118689, loss: 1.565061330795288, rewards: -187, count: 169
success rate:  0.38
epoch: 118699, loss: 0.837851762

success rate:  0.3
epoch: 119549, loss: 1.381857991218567, rewards: -31, count: 31
success rate:  0.3
epoch: 119559, loss: 1.1177892684936523, rewards: -127, count: 118
success rate:  0.29
epoch: 119569, loss: 1.0883607864379883, rewards: -74, count: 74
success rate:  0.28
epoch: 119579, loss: 1.2541563510894775, rewards: -65, count: 56
success rate:  0.31
epoch: 119589, loss: 1.4682703018188477, rewards: -136, count: 136
success rate:  0.32
epoch: 119599, loss: 1.143635869026184, rewards: -49, count: 49
success rate:  0.34
epoch: 119609, loss: 1.8264825344085693, rewards: -112, count: 112
success rate:  0.33
epoch: 119619, loss: 2.0822675228118896, rewards: -182, count: 173
success rate:  0.33
epoch: 119629, loss: 1.8938190937042236, rewards: -176, count: 149
success rate:  0.35
epoch: 119639, loss: 0.7807439565658569, rewards: -334, count: 316
success rate:  0.36
epoch: 119649, loss: 1.6682677268981934, rewards: -170, count: 152
success rate:  0.38
epoch: 119659, loss: 1.146969079971

success rate:  0.26
epoch: 120509, loss: 0.952843964099884, rewards: -43, count: 43
success rate:  0.26
epoch: 120519, loss: 0.9057779908180237, rewards: -155, count: 137
success rate:  0.28
epoch: 120529, loss: 1.4865926504135132, rewards: -58, count: 58
success rate:  0.28
epoch: 120539, loss: 1.37726891040802, rewards: -58, count: 58
success rate:  0.3
epoch: 120549, loss: 1.4041528701782227, rewards: -171, count: 171
success rate:  0.32
epoch: 120559, loss: 1.8032681941986084, rewards: -74, count: 65
success rate:  0.31
epoch: 120569, loss: 0.8807080984115601, rewards: -585, count: 495
success rate:  0.33
epoch: 120579, loss: 0.7141292691230774, rewards: -99, count: 99
success rate:  0.32
epoch: 120589, loss: 1.4494200944900513, rewards: -146, count: 128
success rate:  0.36
epoch: 120599, loss: 1.7727793455123901, rewards: -128, count: 119
success rate:  0.36
epoch: 120609, loss: 1.6530256271362305, rewards: -77, count: 68
success rate:  0.37
epoch: 120619, loss: 1.7937039136886597

success rate:  0.33
epoch: 121479, loss: 0.982017457485199, rewards: -188, count: 179
success rate:  0.31
epoch: 121489, loss: 1.75557541847229, rewards: -93, count: 93
success rate:  0.34
epoch: 121499, loss: 1.1621677875518799, rewards: -603, count: 549
success rate:  0.35
epoch: 121509, loss: 0.5188913941383362, rewards: -157, count: 139
success rate:  0.35
epoch: 121519, loss: 0.9051513671875, rewards: -189, count: 180
success rate:  0.3
epoch: 121529, loss: 1.0895203351974487, rewards: -420, count: 375
success rate:  0.32
epoch: 121539, loss: 1.2289783954620361, rewards: -60, count: 60
success rate:  0.3
epoch: 121549, loss: 1.074447512626648, rewards: -348, count: 312
success rate:  0.32
epoch: 121559, loss: 1.8286843299865723, rewards: -70, count: 70
success rate:  0.29
epoch: 121569, loss: 0.8744367361068726, rewards: -358, count: 295
success rate:  0.28
epoch: 121579, loss: 0.4014585614204407, rewards: -109, count: 100
success rate:  0.26
epoch: 121589, loss: 0.994439959526062

success rate:  0.32
epoch: 122439, loss: 1.7493144273757935, rewards: -193, count: 166
success rate:  0.34
epoch: 122449, loss: 0.40288403630256653, rewards: -71, count: 71
success rate:  0.32
epoch: 122459, loss: 1.2673112154006958, rewards: -86, count: 77
success rate:  0.31
epoch: 122469, loss: 0.8426815867424011, rewards: -483, count: 420
success rate:  0.32
epoch: 122479, loss: 1.158444881439209, rewards: -209, count: 209
success rate:  0.33
epoch: 122489, loss: 1.0592467784881592, rewards: -529, count: 484
success rate:  0.33
epoch: 122499, loss: 0.36353349685668945, rewards: -215, count: 197
success rate:  0.35
epoch: 122509, loss: 0.8287683129310608, rewards: -232, count: 223
success rate:  0.37
epoch: 122519, loss: 0.5739255547523499, rewards: -128, count: 128
success rate:  0.35
epoch: 122529, loss: 0.8558390736579895, rewards: -474, count: 438
success rate:  0.35
epoch: 122539, loss: 0.981127142906189, rewards: -198, count: 189
success rate:  0.36
epoch: 122549, loss: 0.5813

success rate:  0.33
epoch: 123389, loss: 0.807410717010498, rewards: -214, count: 187
success rate:  0.31
epoch: 123399, loss: 0.9071211218833923, rewards: -343, count: 280
success rate:  0.31
epoch: 123409, loss: 0.8650366067886353, rewards: -347, count: 329
success rate:  0.34
epoch: 123419, loss: 0.966732382774353, rewards: -80, count: 80
success rate:  0.39
epoch: 123429, loss: 0.6203228831291199, rewards: -101, count: 101
success rate:  0.39
epoch: 123439, loss: 1.1808933019638062, rewards: -194, count: 167
success rate:  0.39
epoch: 123449, loss: 0.7164030075073242, rewards: -481, count: 436
success rate:  0.38
epoch: 123459, loss: 0.7800388932228088, rewards: -205, count: 196
success rate:  0.36
epoch: 123469, loss: 0.8926379680633545, rewards: -131, count: 131
success rate:  0.33
epoch: 123479, loss: 0.9901429414749146, rewards: -54, count: 54
success rate:  0.35
epoch: 123489, loss: 1.008421540260315, rewards: -480, count: 426
success rate:  0.35
epoch: 123499, loss: 1.7675262

success rate:  0.37
epoch: 124349, loss: 0.7866657376289368, rewards: -503, count: 449
success rate:  0.35
epoch: 124359, loss: 0.9272222518920898, rewards: -194, count: 176
success rate:  0.31
epoch: 124369, loss: 1.8538018465042114, rewards: -132, count: 132
success rate:  0.32
epoch: 124379, loss: 1.1019177436828613, rewards: -245, count: 236
success rate:  0.32
epoch: 124389, loss: 2.2443199157714844, rewards: -107, count: 98
success rate:  0.3
epoch: 124399, loss: 1.6095013618469238, rewards: -53, count: 53
success rate:  0.33
epoch: 124409, loss: 1.7966574430465698, rewards: -207, count: 207
success rate:  0.33
epoch: 124419, loss: 1.4436601400375366, rewards: -56, count: 56
success rate:  0.34
epoch: 124429, loss: 1.6219642162322998, rewards: -193, count: 193
success rate:  0.37
epoch: 124439, loss: 0.8719996809959412, rewards: -450, count: 414
success rate:  0.36
epoch: 124449, loss: 1.301593542098999, rewards: -193, count: 175
success rate:  0.39
epoch: 124459, loss: 1.2611654

success rate:  0.35
epoch: 125319, loss: 1.3471546173095703, rewards: -115, count: 106
success rate:  0.34
epoch: 125329, loss: 1.5985599756240845, rewards: -135, count: 135
success rate:  0.3
epoch: 125339, loss: 1.0065410137176514, rewards: -180, count: 171
success rate:  0.3
epoch: 125349, loss: 1.137086033821106, rewards: -108, count: 81
success rate:  0.3
epoch: 125359, loss: 1.169864296913147, rewards: -214, count: 196
success rate:  0.29
epoch: 125369, loss: 1.120477318763733, rewards: -160, count: 151
success rate:  0.32
epoch: 125379, loss: 1.7068798542022705, rewards: -110, count: 110
success rate:  0.33
epoch: 125389, loss: 2.300769090652466, rewards: -89, count: 89
success rate:  0.32
epoch: 125399, loss: 0.41713330149650574, rewards: -217, count: 208
success rate:  0.3
epoch: 125409, loss: 1.7152912616729736, rewards: -188, count: 170
success rate:  0.32
epoch: 125419, loss: 1.439152479171753, rewards: -283, count: 265
success rate:  0.32
epoch: 125429, loss: 1.24154126644

success rate:  0.36
epoch: 126289, loss: 0.8251044154167175, rewards: -372, count: 336
success rate:  0.38
epoch: 126299, loss: 1.1333931684494019, rewards: -321, count: 312
success rate:  0.4
epoch: 126309, loss: 1.4933967590332031, rewards: -397, count: 361
success rate:  0.39
epoch: 126319, loss: 0.9368361830711365, rewards: -430, count: 394
success rate:  0.34
epoch: 126329, loss: 1.2267158031463623, rewards: -334, count: 316
success rate:  0.34
epoch: 126339, loss: 1.0595951080322266, rewards: -100, count: 100
success rate:  0.36
epoch: 126349, loss: 0.7965698838233948, rewards: -752, count: 707
success rate:  0.33
epoch: 126359, loss: 1.1561229228973389, rewards: -89, count: 80
success rate:  0.34
epoch: 126369, loss: 1.439069151878357, rewards: -84, count: 84
success rate:  0.33
epoch: 126379, loss: 1.747582197189331, rewards: -44, count: 44
success rate:  0.34
epoch: 126389, loss: 0.6921430826187134, rewards: -133, count: 124
success rate:  0.3
epoch: 126399, loss: 1.7573431730

success rate:  0.35
epoch: 127249, loss: 1.776464581489563, rewards: -70, count: 70
success rate:  0.34
epoch: 127259, loss: 0.980270504951477, rewards: -69, count: 69
success rate:  0.32
epoch: 127269, loss: 1.2553515434265137, rewards: -112, count: 103
success rate:  0.31
epoch: 127279, loss: 1.874598741531372, rewards: -124, count: 124
success rate:  0.32
epoch: 127289, loss: 1.4220256805419922, rewards: -94, count: 94
success rate:  0.33
epoch: 127299, loss: 1.21141517162323, rewards: -192, count: 192
success rate:  0.35
epoch: 127309, loss: 1.401741623878479, rewards: -93, count: 84
success rate:  0.38
epoch: 127319, loss: 1.4936859607696533, rewards: -30, count: 30
success rate:  0.37
epoch: 127329, loss: 2.111245632171631, rewards: -117, count: 99
success rate:  0.37
epoch: 127339, loss: 1.3652080297470093, rewards: -276, count: 267
success rate:  0.34
epoch: 127349, loss: 0.7846885919570923, rewards: -60, count: 60
success rate:  0.36
epoch: 127359, loss: 1.7611286640167236, re

success rate:  0.3
epoch: 128219, loss: 0.5697754621505737, rewards: -54, count: 54
success rate:  0.31
epoch: 128229, loss: 1.0306187868118286, rewards: -225, count: 216
success rate:  0.29
epoch: 128239, loss: 1.2903836965560913, rewards: -142, count: 142
success rate:  0.32
epoch: 128249, loss: 0.9669955372810364, rewards: -259, count: 241
success rate:  0.35
epoch: 128259, loss: 0.7967813014984131, rewards: -947, count: 830
success rate:  0.35
epoch: 128269, loss: 0.8361861109733582, rewards: -311, count: 275
success rate:  0.39
epoch: 128279, loss: 0.7486937046051025, rewards: -42, count: 42
success rate:  0.34
epoch: 128289, loss: 1.2461445331573486, rewards: -87, count: 78
success rate:  0.33
epoch: 128299, loss: 0.6933896541595459, rewards: -81, count: 81
success rate:  0.32
epoch: 128309, loss: 0.9858641028404236, rewards: -87, count: 78
success rate:  0.32
epoch: 128319, loss: 1.298154592514038, rewards: -173, count: 164
success rate:  0.27
epoch: 128329, loss: 0.942334532737

success rate:  0.41
epoch: 129189, loss: 0.9305101037025452, rewards: -79, count: 79
success rate:  0.38
epoch: 129199, loss: 0.45436060428619385, rewards: -264, count: 237
success rate:  0.37
epoch: 129209, loss: 0.6686549186706543, rewards: -1086, count: 942
success rate:  0.39
epoch: 129219, loss: 0.8325251936912537, rewards: -589, count: 508
success rate:  0.39
epoch: 129229, loss: 1.1039228439331055, rewards: -60, count: 51
success rate:  0.42
epoch: 129239, loss: 0.7224154472351074, rewards: -309, count: 273
success rate:  0.41
epoch: 129249, loss: 0.6147380471229553, rewards: -245, count: 209
success rate:  0.4
epoch: 129259, loss: 1.0812709331512451, rewards: -222, count: 195
success rate:  0.43
epoch: 129269, loss: 0.3901313543319702, rewards: -66, count: 66
success rate:  0.42
epoch: 129279, loss: 0.7693096995353699, rewards: -59, count: 59
success rate:  0.4
epoch: 129289, loss: 1.0149258375167847, rewards: -249, count: 249
success rate:  0.41
epoch: 129299, loss: 0.80762648

success rate:  0.31
epoch: 130159, loss: 0.93141770362854, rewards: -189, count: 180
success rate:  0.3
epoch: 130169, loss: 1.5599946975708008, rewards: -96, count: 96
success rate:  0.31
epoch: 130179, loss: 0.6948471665382385, rewards: -391, count: 355
success rate:  0.36
epoch: 130189, loss: 1.6661001443862915, rewards: -40, count: 40
success rate:  0.4
epoch: 130199, loss: 0.8420406579971313, rewards: -415, count: 370
success rate:  0.41
epoch: 130209, loss: 1.0300612449645996, rewards: -306, count: 279
success rate:  0.41
epoch: 130219, loss: 0.528134822845459, rewards: -381, count: 327
success rate:  0.41
epoch: 130229, loss: 0.6677830219268799, rewards: -171, count: 153
success rate:  0.43
epoch: 130239, loss: 0.9662159085273743, rewards: -271, count: 244
success rate:  0.42
epoch: 130249, loss: 1.341654896736145, rewards: -237, count: 237
success rate:  0.39
epoch: 130259, loss: 0.5963869094848633, rewards: -310, count: 301
success rate:  0.4
epoch: 130269, loss: 0.70924806594

success rate:  0.35
epoch: 131109, loss: 0.6912803053855896, rewards: -282, count: 273
success rate:  0.32
epoch: 131119, loss: 1.5359725952148438, rewards: -133, count: 133
success rate:  0.32
epoch: 131129, loss: 1.4008654356002808, rewards: -213, count: 213
success rate:  0.32
epoch: 131139, loss: 1.8945304155349731, rewards: -66, count: 66
success rate:  0.29
epoch: 131149, loss: 1.4145413637161255, rewards: -27, count: 27
success rate:  0.31
epoch: 131159, loss: 1.7958096265792847, rewards: -158, count: 158
success rate:  0.29
epoch: 131169, loss: 0.554261326789856, rewards: -659, count: 524
success rate:  0.3
epoch: 131179, loss: 0.8821197748184204, rewards: -55, count: 55
success rate:  0.29
epoch: 131189, loss: 2.212714433670044, rewards: -56, count: 47
success rate:  0.31
epoch: 131199, loss: 1.1225851774215698, rewards: -229, count: 220
success rate:  0.3
epoch: 131209, loss: 1.011215329170227, rewards: -515, count: 479
success rate:  0.3
epoch: 131219, loss: 1.43169891834259

success rate:  0.34
epoch: 132069, loss: 1.7056095600128174, rewards: -141, count: 114
success rate:  0.31
epoch: 132079, loss: 1.3163063526153564, rewards: -75, count: 66
success rate:  0.29
epoch: 132089, loss: 1.1449613571166992, rewards: -480, count: 444
success rate:  0.27
epoch: 132099, loss: 1.2855490446090698, rewards: -135, count: 135
success rate:  0.29
epoch: 132109, loss: 1.4144200086593628, rewards: -37, count: 37
success rate:  0.29
epoch: 132119, loss: 1.8092553615570068, rewards: -81, count: 81
success rate:  0.3
epoch: 132129, loss: 1.463362693786621, rewards: -178, count: 160
success rate:  0.31
epoch: 132139, loss: 1.2071162462234497, rewards: -154, count: 154
success rate:  0.27
epoch: 132149, loss: 1.2455060482025146, rewards: -278, count: 242
success rate:  0.25
epoch: 132159, loss: 1.9014763832092285, rewards: -75, count: 75
success rate:  0.23
epoch: 132169, loss: 1.9192912578582764, rewards: -107, count: 98
success rate:  0.24
epoch: 132179, loss: 1.74708282947

success rate:  0.26
epoch: 133029, loss: 0.45638084411621094, rewards: -236, count: 209
success rate:  0.31
epoch: 133039, loss: 1.3206461668014526, rewards: -206, count: 197
success rate:  0.31
epoch: 133049, loss: 1.3805994987487793, rewards: -143, count: 134
success rate:  0.3
epoch: 133059, loss: 0.6364403963088989, rewards: -29, count: 29
success rate:  0.28
epoch: 133069, loss: 0.7141551971435547, rewards: -319, count: 301
success rate:  0.29
epoch: 133079, loss: 1.3671518564224243, rewards: -180, count: 153
success rate:  0.27
epoch: 133089, loss: 1.576615810394287, rewards: -161, count: 152
success rate:  0.29
epoch: 133099, loss: 1.2880275249481201, rewards: -110, count: 110
success rate:  0.36
epoch: 133109, loss: 1.432760238647461, rewards: -77, count: 77
success rate:  0.35
epoch: 133119, loss: 1.0678951740264893, rewards: -229, count: 202
success rate:  0.37
epoch: 133129, loss: 1.015960931777954, rewards: -505, count: 487
success rate:  0.36
epoch: 133139, loss: 1.6305203

success rate:  0.27
epoch: 133989, loss: 0.9091761708259583, rewards: -456, count: 357
success rate:  0.28
epoch: 133999, loss: 0.6192231774330139, rewards: -346, count: 310
success rate:  0.28
epoch: 134009, loss: 1.0111932754516602, rewards: -234, count: 180
success rate:  0.29
epoch: 134019, loss: 0.8228223323822021, rewards: -67, count: 67
success rate:  0.29
epoch: 134029, loss: 0.9177780747413635, rewards: -412, count: 358
success rate:  0.3
epoch: 134039, loss: 0.6897883415222168, rewards: -201, count: 183
success rate:  0.29
epoch: 134049, loss: 1.1731172800064087, rewards: -388, count: 361
success rate:  0.29
epoch: 134059, loss: 0.45890024304389954, rewards: -199, count: 172
success rate:  0.35
epoch: 134069, loss: 0.4097152054309845, rewards: -157, count: 157
success rate:  0.33
epoch: 134079, loss: 0.6923313736915588, rewards: -283, count: 247
success rate:  0.32
epoch: 134089, loss: 0.5674698352813721, rewards: -428, count: 383
success rate:  0.28
epoch: 134099, loss: 1.10

success rate:  0.36
epoch: 134949, loss: 0.6269192099571228, rewards: -602, count: 539
success rate:  0.36
epoch: 134959, loss: 0.6222882866859436, rewards: -100, count: 91
success rate:  0.36
epoch: 134969, loss: 1.494285225868225, rewards: -180, count: 153
success rate:  0.36
epoch: 134979, loss: 1.6895170211791992, rewards: -129, count: 102
success rate:  0.34
epoch: 134989, loss: 0.8357074856758118, rewards: -207, count: 198
success rate:  0.34
epoch: 134999, loss: 0.6409481763839722, rewards: -187, count: 187
success rate:  0.33
epoch: 135009, loss: 1.1689584255218506, rewards: -48, count: 48
success rate:  0.32
epoch: 135019, loss: 1.3300416469573975, rewards: -70, count: 52
success rate:  0.28
epoch: 135029, loss: 1.0353145599365234, rewards: -192, count: 165
success rate:  0.28
epoch: 135039, loss: 0.9778236150741577, rewards: -350, count: 296
success rate:  0.28
epoch: 135049, loss: 1.0883418321609497, rewards: -209, count: 191
success rate:  0.3
epoch: 135059, loss: 0.5943794

success rate:  0.33
epoch: 135909, loss: 0.9229145050048828, rewards: -499, count: 481
success rate:  0.32
epoch: 135919, loss: 0.939514696598053, rewards: -418, count: 355
success rate:  0.33
epoch: 135929, loss: 1.2579714059829712, rewards: -57, count: 57
success rate:  0.32
epoch: 135939, loss: 1.265795111656189, rewards: -190, count: 172
success rate:  0.35
epoch: 135949, loss: 1.2577974796295166, rewards: -40, count: 40
success rate:  0.37
epoch: 135959, loss: 1.0784937143325806, rewards: -309, count: 273
success rate:  0.35
epoch: 135969, loss: 0.8978520035743713, rewards: -270, count: 234
success rate:  0.32
epoch: 135979, loss: 1.4834249019622803, rewards: -66, count: 57
success rate:  0.34
epoch: 135989, loss: 1.152192234992981, rewards: -429, count: 375
success rate:  0.38
epoch: 135999, loss: 1.5615898370742798, rewards: -66, count: 57
success rate:  0.38
epoch: 136009, loss: 0.9182987213134766, rewards: -358, count: 340
success rate:  0.42
epoch: 136019, loss: 0.79507672786

success rate:  0.46
epoch: 136879, loss: 1.5684943199157715, rewards: -45, count: 45
success rate:  0.42
epoch: 136889, loss: 2.1478872299194336, rewards: -200, count: 173
success rate:  0.44
epoch: 136899, loss: 1.5766593217849731, rewards: -62, count: 62
success rate:  0.45
epoch: 136909, loss: 1.7501376867294312, rewards: -129, count: 111
success rate:  0.42
epoch: 136919, loss: 1.549733281135559, rewards: -188, count: 179
success rate:  0.38
epoch: 136929, loss: 1.2397016286849976, rewards: -41, count: 41
success rate:  0.37
epoch: 136939, loss: 1.4104877710342407, rewards: -89, count: 89
success rate:  0.38
epoch: 136949, loss: 0.50371915102005, rewards: -267, count: 249
success rate:  0.39
epoch: 136959, loss: 1.078273057937622, rewards: -575, count: 539
success rate:  0.37
epoch: 136969, loss: 1.086366891860962, rewards: -380, count: 317
success rate:  0.37
epoch: 136979, loss: 2.0732574462890625, rewards: -81, count: 81
success rate:  0.37
epoch: 136989, loss: 0.206698626279830

success rate:  0.32
epoch: 137839, loss: 0.6259219646453857, rewards: -425, count: 416
success rate:  0.32
epoch: 137849, loss: 0.5686239004135132, rewards: -80, count: 80
success rate:  0.31
epoch: 137859, loss: 0.9028943777084351, rewards: -157, count: 148
success rate:  0.3
epoch: 137869, loss: 0.8041074872016907, rewards: -297, count: 297
success rate:  0.31
epoch: 137879, loss: 0.36280372738838196, rewards: -694, count: 622
success rate:  0.34
epoch: 137889, loss: 0.7144661545753479, rewards: -188, count: 188
success rate:  0.32
epoch: 137899, loss: 0.9344038367271423, rewards: -244, count: 226
success rate:  0.33
epoch: 137909, loss: 0.9860467910766602, rewards: -251, count: 251
success rate:  0.36
epoch: 137919, loss: 0.9312776327133179, rewards: -665, count: 647
success rate:  0.36
epoch: 137929, loss: 1.1809401512145996, rewards: -94, count: 85
success rate:  0.37
epoch: 137939, loss: 1.7916977405548096, rewards: -142, count: 133
success rate:  0.35
epoch: 137949, loss: 0.6272

success rate:  0.38
epoch: 138799, loss: 0.4249531626701355, rewards: -69, count: 69
success rate:  0.36
epoch: 138809, loss: 0.6715724468231201, rewards: -359, count: 341
success rate:  0.32
epoch: 138819, loss: 0.7578028440475464, rewards: -321, count: 303
success rate:  0.33
epoch: 138829, loss: 0.8820682168006897, rewards: -320, count: 293
success rate:  0.35
epoch: 138839, loss: 0.7038153409957886, rewards: -97, count: 97
success rate:  0.34
epoch: 138849, loss: 0.6819002032279968, rewards: -318, count: 300
success rate:  0.37
epoch: 138859, loss: 0.6842086315155029, rewards: -640, count: 550
success rate:  0.39
epoch: 138869, loss: 0.3715353012084961, rewards: -41, count: 41
success rate:  0.36
epoch: 138879, loss: 0.537007212638855, rewards: -183, count: 174
success rate:  0.34
epoch: 138889, loss: 0.4043210744857788, rewards: -217, count: 208
success rate:  0.36
epoch: 138899, loss: 0.8175163865089417, rewards: -103, count: 94
success rate:  0.34
epoch: 138909, loss: 0.98615169

success rate:  0.32
epoch: 139749, loss: 0.7045646905899048, rewards: -191, count: 182
success rate:  0.29
epoch: 139759, loss: 0.7992156147956848, rewards: -114, count: 114
success rate:  0.28
epoch: 139769, loss: 0.24433404207229614, rewards: -108, count: 99
success rate:  0.26
epoch: 139779, loss: 0.683636486530304, rewards: -152, count: 143
success rate:  0.25
epoch: 139789, loss: 0.5685427188873291, rewards: -164, count: 146
success rate:  0.23
epoch: 139799, loss: 0.4499235451221466, rewards: -111, count: 111
success rate:  0.26
epoch: 139809, loss: 1.0442383289337158, rewards: -145, count: 145
success rate:  0.3
epoch: 139819, loss: 0.8473145365715027, rewards: -421, count: 394
success rate:  0.29
epoch: 139829, loss: 0.3783377408981323, rewards: -202, count: 184
success rate:  0.28
epoch: 139839, loss: 0.6477391719818115, rewards: -283, count: 256
success rate:  0.27
epoch: 139849, loss: 0.429779589176178, rewards: -377, count: 350
success rate:  0.26
epoch: 139859, loss: 1.025

success rate:  0.37
epoch: 140699, loss: 0.786176323890686, rewards: -255, count: 237
success rate:  0.39
epoch: 140709, loss: 1.0839238166809082, rewards: -213, count: 195
success rate:  0.38
epoch: 140719, loss: 0.7762512564659119, rewards: -52, count: 52
success rate:  0.37
epoch: 140729, loss: 1.0560965538024902, rewards: -108, count: 108
success rate:  0.37
epoch: 140739, loss: 1.2982635498046875, rewards: -141, count: 132
success rate:  0.36
epoch: 140749, loss: 0.9903151988983154, rewards: -142, count: 142
success rate:  0.35
epoch: 140759, loss: 1.4260051250457764, rewards: -124, count: 106
success rate:  0.36
epoch: 140769, loss: 0.8809440732002258, rewards: -136, count: 136
success rate:  0.38
epoch: 140779, loss: 1.0761935710906982, rewards: -147, count: 138
success rate:  0.38
epoch: 140789, loss: 0.8939724564552307, rewards: -344, count: 326
success rate:  0.4
epoch: 140799, loss: 0.9498397707939148, rewards: -66, count: 66
success rate:  0.39
epoch: 140809, loss: 0.771123

success rate:  0.25
epoch: 141649, loss: 0.8664307594299316, rewards: -57, count: 48
success rate:  0.28
epoch: 141659, loss: 0.48475372791290283, rewards: -278, count: 260
success rate:  0.27
epoch: 141669, loss: 0.6540936827659607, rewards: -291, count: 246
success rate:  0.27
epoch: 141679, loss: 0.5506805777549744, rewards: -93, count: 93
success rate:  0.27
epoch: 141689, loss: 1.2052562236785889, rewards: -202, count: 166
success rate:  0.27
epoch: 141699, loss: 0.5975936055183411, rewards: -217, count: 199
success rate:  0.27
epoch: 141709, loss: 0.8602858781814575, rewards: -96, count: 87
success rate:  0.29
epoch: 141719, loss: 1.2576687335968018, rewards: -201, count: 201
success rate:  0.28
epoch: 141729, loss: 0.6201133728027344, rewards: -354, count: 318
success rate:  0.29
epoch: 141739, loss: 0.9338699579238892, rewards: -146, count: 137
success rate:  0.32
epoch: 141749, loss: 0.7166494131088257, rewards: -160, count: 151
success rate:  0.32
epoch: 141759, loss: 0.25918

success rate:  0.31
epoch: 142609, loss: 0.479735791683197, rewards: -371, count: 362
success rate:  0.3
epoch: 142619, loss: 1.0575571060180664, rewards: -269, count: 233
success rate:  0.29
epoch: 142629, loss: 0.951353907585144, rewards: -127, count: 118
success rate:  0.29
epoch: 142639, loss: 0.7042104005813599, rewards: -345, count: 327
success rate:  0.33
epoch: 142649, loss: 1.1034976243972778, rewards: -257, count: 257
success rate:  0.34
epoch: 142659, loss: 0.726513147354126, rewards: -120, count: 102
success rate:  0.34
epoch: 142669, loss: 1.298264503479004, rewards: -79, count: 79
success rate:  0.37
epoch: 142679, loss: 0.9111866354942322, rewards: -191, count: 173
success rate:  0.39
epoch: 142689, loss: 0.923481822013855, rewards: -209, count: 200
success rate:  0.38
epoch: 142699, loss: 0.9536621570587158, rewards: -89, count: 89
success rate:  0.37
epoch: 142709, loss: 0.7672501802444458, rewards: -165, count: 147
success rate:  0.37
epoch: 142719, loss: 0.5667005777

success rate:  0.28
epoch: 143579, loss: 1.271624207496643, rewards: -145, count: 136
success rate:  0.29
epoch: 143589, loss: 1.2627161741256714, rewards: -89, count: 89
success rate:  0.32
epoch: 143599, loss: 0.5824438333511353, rewards: -405, count: 387
success rate:  0.32
epoch: 143609, loss: 0.6367349624633789, rewards: -431, count: 359
success rate:  0.34
epoch: 143619, loss: 0.721052348613739, rewards: -509, count: 464
success rate:  0.32
epoch: 143629, loss: 0.5146090984344482, rewards: -527, count: 464
success rate:  0.32
epoch: 143639, loss: 0.563382625579834, rewards: -310, count: 274
success rate:  0.33
epoch: 143649, loss: 0.7253056764602661, rewards: -150, count: 123
success rate:  0.32
epoch: 143659, loss: 0.8767415881156921, rewards: -257, count: 239
success rate:  0.37
epoch: 143669, loss: 0.616104006767273, rewards: -174, count: 165
success rate:  0.38
epoch: 143679, loss: 0.9673877358436584, rewards: -107, count: 98
success rate:  0.36
epoch: 143689, loss: 0.9033367

success rate:  0.3
epoch: 144549, loss: 0.9893398284912109, rewards: -267, count: 249
success rate:  0.3
epoch: 144559, loss: 0.7075144052505493, rewards: -446, count: 410
success rate:  0.32
epoch: 144569, loss: 0.7606719732284546, rewards: -106, count: 97
success rate:  0.34
epoch: 144579, loss: 0.5522040724754333, rewards: -328, count: 274
success rate:  0.35
epoch: 144589, loss: 1.3888030052185059, rewards: -132, count: 132
success rate:  0.34
epoch: 144599, loss: 0.9124539494514465, rewards: -245, count: 209
success rate:  0.36
epoch: 144609, loss: 0.2612209916114807, rewards: -47, count: 47
success rate:  0.36
epoch: 144619, loss: 0.9984642267227173, rewards: -234, count: 216
success rate:  0.37
epoch: 144629, loss: 0.6053771376609802, rewards: -142, count: 142
success rate:  0.39
epoch: 144639, loss: 1.0928993225097656, rewards: -64, count: 55
success rate:  0.38
epoch: 144649, loss: 0.5866991281509399, rewards: -183, count: 174
success rate:  0.4
epoch: 144659, loss: 1.01243245

success rate:  0.33
epoch: 145499, loss: 1.1361140012741089, rewards: -292, count: 283
success rate:  0.33
epoch: 145509, loss: 0.932905912399292, rewards: -227, count: 182
success rate:  0.31
epoch: 145519, loss: 0.7779572606086731, rewards: -233, count: 206
success rate:  0.33
epoch: 145529, loss: 0.23767133057117462, rewards: -114, count: 114
success rate:  0.35
epoch: 145539, loss: 0.8077830672264099, rewards: -216, count: 207
success rate:  0.34
epoch: 145549, loss: 0.7441131472587585, rewards: -372, count: 354
success rate:  0.32
epoch: 145559, loss: 0.7321001291275024, rewards: -208, count: 208
success rate:  0.33
epoch: 145569, loss: 0.8666988611221313, rewards: -150, count: 141
success rate:  0.31
epoch: 145579, loss: 1.046040654182434, rewards: -133, count: 115
success rate:  0.27
epoch: 145589, loss: 0.5967335104942322, rewards: -122, count: 113
success rate:  0.26
epoch: 145599, loss: 1.2332533597946167, rewards: -92, count: 83
success rate:  0.28
epoch: 145609, loss: 0.597

success rate:  0.28
epoch: 146469, loss: 1.0877066850662231, rewards: -740, count: 695
success rate:  0.29
epoch: 146479, loss: 0.809349775314331, rewards: -27, count: 27
success rate:  0.31
epoch: 146489, loss: 0.9651021957397461, rewards: -86, count: 86
success rate:  0.31
epoch: 146499, loss: 0.8971287608146667, rewards: -183, count: 174
success rate:  0.32
epoch: 146509, loss: 1.3092595338821411, rewards: -304, count: 286
success rate:  0.31
epoch: 146519, loss: 1.1827133893966675, rewards: -244, count: 217
success rate:  0.31
epoch: 146529, loss: 0.5921088457107544, rewards: -525, count: 480
success rate:  0.32
epoch: 146539, loss: 1.3774605989456177, rewards: -193, count: 166
success rate:  0.3
epoch: 146549, loss: 1.153583288192749, rewards: -157, count: 148
success rate:  0.3
epoch: 146559, loss: 1.1896812915802002, rewards: -209, count: 182
success rate:  0.31
epoch: 146569, loss: 0.9601292610168457, rewards: -403, count: 376
success rate:  0.33
epoch: 146579, loss: 1.07509768

success rate:  0.4
epoch: 147439, loss: 0.5303430557250977, rewards: -216, count: 207
success rate:  0.38
epoch: 147449, loss: 0.8111960291862488, rewards: -233, count: 224
success rate:  0.34
epoch: 147459, loss: 0.7831529974937439, rewards: -364, count: 355
success rate:  0.33
epoch: 147469, loss: 0.9036155939102173, rewards: -52, count: 52
success rate:  0.37
epoch: 147479, loss: 0.7934479117393494, rewards: -151, count: 151
success rate:  0.36
epoch: 147489, loss: 0.6284185647964478, rewards: -63, count: 63
success rate:  0.33
epoch: 147499, loss: 0.7451378703117371, rewards: -397, count: 379
success rate:  0.33
epoch: 147509, loss: 0.7219402194023132, rewards: -299, count: 263
success rate:  0.3
epoch: 147519, loss: 1.0260905027389526, rewards: -253, count: 235
success rate:  0.33
epoch: 147529, loss: 0.39596661925315857, rewards: -497, count: 434
success rate:  0.33
epoch: 147539, loss: 0.6935255527496338, rewards: -376, count: 331
success rate:  0.32
epoch: 147549, loss: 0.71095

success rate:  0.33
epoch: 148399, loss: 0.5022265315055847, rewards: -644, count: 563
success rate:  0.35
epoch: 148409, loss: 0.5331953167915344, rewards: -241, count: 223
success rate:  0.32
epoch: 148419, loss: 1.1978263854980469, rewards: -374, count: 347
success rate:  0.31
epoch: 148429, loss: 0.6438330411911011, rewards: -284, count: 266
success rate:  0.32
epoch: 148439, loss: 1.166322946548462, rewards: -118, count: 118
success rate:  0.35
epoch: 148449, loss: 0.8918770551681519, rewards: -234, count: 216
success rate:  0.37
epoch: 148459, loss: 1.1403182744979858, rewards: -182, count: 173
success rate:  0.33
epoch: 148469, loss: 0.6186004877090454, rewards: -346, count: 292
success rate:  0.35
epoch: 148479, loss: 1.5300453901290894, rewards: -64, count: 64
success rate:  0.32
epoch: 148489, loss: 0.745529055595398, rewards: -646, count: 556
success rate:  0.34
epoch: 148499, loss: 0.3038288652896881, rewards: -394, count: 358
success rate:  0.33
epoch: 148509, loss: 0.9147

success rate:  0.33
epoch: 149349, loss: 0.42475348711013794, rewards: -139, count: 139
success rate:  0.33
epoch: 149359, loss: 0.607605516910553, rewards: -65, count: 65
success rate:  0.38
epoch: 149369, loss: 1.1632519960403442, rewards: -110, count: 110
success rate:  0.35
epoch: 149379, loss: 0.8849654197692871, rewards: -205, count: 196
success rate:  0.33
epoch: 149389, loss: 0.4566153883934021, rewards: -746, count: 692
success rate:  0.32
epoch: 149399, loss: 0.49348363280296326, rewards: -593, count: 539
success rate:  0.31
epoch: 149409, loss: 0.5933123230934143, rewards: -60, count: 42
success rate:  0.3
epoch: 149419, loss: 0.5024504661560059, rewards: -87, count: 87
success rate:  0.33
epoch: 149429, loss: 0.6435087323188782, rewards: -72, count: 72
success rate:  0.34
epoch: 149439, loss: 0.9826087355613708, rewards: -56, count: 56
success rate:  0.34
epoch: 149449, loss: 0.6425744295120239, rewards: -130, count: 121
success rate:  0.31
epoch: 149459, loss: 0.5052332282

success rate:  0.33
epoch: 150299, loss: 0.5779451727867126, rewards: -107, count: 89
success rate:  0.34
epoch: 150309, loss: 0.27559518814086914, rewards: -50, count: 50
success rate:  0.35
epoch: 150319, loss: 0.884468674659729, rewards: -467, count: 431
success rate:  0.33
epoch: 150329, loss: 0.5255143046379089, rewards: -497, count: 452
success rate:  0.32
epoch: 150339, loss: 1.6709089279174805, rewards: -46, count: 46
success rate:  0.34
epoch: 150349, loss: 1.0426994562149048, rewards: -222, count: 213
success rate:  0.36
epoch: 150359, loss: 0.7500942945480347, rewards: -243, count: 225
success rate:  0.38
epoch: 150369, loss: 0.584775984287262, rewards: -665, count: 611
success rate:  0.35
epoch: 150379, loss: 0.8953531384468079, rewards: -395, count: 368
success rate:  0.34
epoch: 150389, loss: 1.6644412279129028, rewards: -74, count: 65
success rate:  0.34
epoch: 150399, loss: 0.6771155595779419, rewards: -151, count: 142
success rate:  0.33
epoch: 150409, loss: 0.70133268

success rate:  0.41
epoch: 151269, loss: 0.5037750601768494, rewards: -650, count: 587
success rate:  0.39
epoch: 151279, loss: 0.5009394288063049, rewards: -456, count: 393
success rate:  0.4
epoch: 151289, loss: 0.764214813709259, rewards: -67, count: 67
success rate:  0.41
epoch: 151299, loss: 1.1490198373794556, rewards: -61, count: 61
success rate:  0.38
epoch: 151309, loss: 1.001354455947876, rewards: -355, count: 310
success rate:  0.37
epoch: 151319, loss: 1.0436991453170776, rewards: -84, count: 84
success rate:  0.34
epoch: 151329, loss: 0.6841356754302979, rewards: -142, count: 133
success rate:  0.29
epoch: 151339, loss: 0.762428879737854, rewards: -423, count: 387
success rate:  0.29
epoch: 151349, loss: 1.3953863382339478, rewards: -102, count: 102
success rate:  0.3
epoch: 151359, loss: 1.299522876739502, rewards: -134, count: 125
success rate:  0.33
epoch: 151369, loss: 0.7538398504257202, rewards: -81, count: 72
success rate:  0.33
epoch: 151379, loss: 0.70462352037429

success rate:  0.34
epoch: 152229, loss: 0.9412848949432373, rewards: -236, count: 218
success rate:  0.34
epoch: 152239, loss: 0.7531506419181824, rewards: -270, count: 234
success rate:  0.33
epoch: 152249, loss: 0.5727588534355164, rewards: -766, count: 703
success rate:  0.33
epoch: 152259, loss: 0.8527271747589111, rewards: -247, count: 238
success rate:  0.34
epoch: 152269, loss: 0.3218500018119812, rewards: -202, count: 193
success rate:  0.36
epoch: 152279, loss: 1.1727294921875, rewards: -355, count: 328
success rate:  0.34
epoch: 152289, loss: 0.7795223593711853, rewards: -239, count: 185
success rate:  0.35
epoch: 152299, loss: 0.8163594603538513, rewards: -228, count: 228
success rate:  0.36
epoch: 152309, loss: 0.8784142136573792, rewards: -91, count: 91
success rate:  0.38
epoch: 152319, loss: 1.0953456163406372, rewards: -64, count: 64
success rate:  0.41
epoch: 152329, loss: 1.0099270343780518, rewards: -225, count: 198
success rate:  0.37
epoch: 152339, loss: 0.3458249

success rate:  0.31
epoch: 153199, loss: 0.9265036582946777, rewards: -463, count: 445
success rate:  0.32
epoch: 153209, loss: 0.9629391431808472, rewards: -338, count: 302
success rate:  0.33
epoch: 153219, loss: 1.4400980472564697, rewards: -41, count: 41
success rate:  0.32
epoch: 153229, loss: 0.9806534647941589, rewards: -410, count: 383
success rate:  0.32
epoch: 153239, loss: 0.09462346881628036, rewards: -353, count: 317
success rate:  0.31
epoch: 153249, loss: 0.6029493808746338, rewards: -102, count: 93
success rate:  0.31
epoch: 153259, loss: 0.60291588306427, rewards: -516, count: 435
success rate:  0.33
epoch: 153269, loss: 1.1888689994812012, rewards: -114, count: 114
success rate:  0.33
epoch: 153279, loss: 1.1926566362380981, rewards: -62, count: 62
success rate:  0.31
epoch: 153289, loss: 0.9452396035194397, rewards: -202, count: 193
success rate:  0.3
epoch: 153299, loss: 1.6146118640899658, rewards: -180, count: 171
success rate:  0.32
epoch: 153309, loss: 0.9196327

success rate:  0.36
epoch: 154159, loss: 0.8304955363273621, rewards: -1010, count: 911
success rate:  0.34
epoch: 154169, loss: 1.0844885110855103, rewards: -132, count: 114
success rate:  0.35
epoch: 154179, loss: 0.6043215990066528, rewards: -911, count: 821
success rate:  0.34
epoch: 154189, loss: 1.3602756261825562, rewards: -79, count: 70
success rate:  0.3
epoch: 154199, loss: 0.6393684148788452, rewards: -139, count: 130
success rate:  0.32
epoch: 154209, loss: 1.229864239692688, rewards: -311, count: 275
success rate:  0.3
epoch: 154219, loss: 0.908992350101471, rewards: -150, count: 141
success rate:  0.29
epoch: 154229, loss: 1.4854230880737305, rewards: -108, count: 108
success rate:  0.29
epoch: 154239, loss: 1.3315324783325195, rewards: -149, count: 149
success rate:  0.3
epoch: 154249, loss: 0.8892897963523865, rewards: -265, count: 238
success rate:  0.32
epoch: 154259, loss: 1.0609002113342285, rewards: -57, count: 57
success rate:  0.31
epoch: 154269, loss: 1.44891738

success rate:  0.31
epoch: 155119, loss: 1.2043567895889282, rewards: -78, count: 69
success rate:  0.36
epoch: 155129, loss: 1.5583550930023193, rewards: -109, count: 109
success rate:  0.35
epoch: 155139, loss: 1.652537226676941, rewards: -129, count: 120
success rate:  0.33
epoch: 155149, loss: 1.511938452720642, rewards: -517, count: 472
success rate:  0.31
epoch: 155159, loss: 1.2571039199829102, rewards: -262, count: 244
success rate:  0.29
epoch: 155169, loss: 1.0617362260818481, rewards: -193, count: 175
success rate:  0.31
epoch: 155179, loss: 0.7677730917930603, rewards: -237, count: 228
success rate:  0.34
epoch: 155189, loss: 0.6260480284690857, rewards: -112, count: 103
success rate:  0.32
epoch: 155199, loss: 0.9083542823791504, rewards: -257, count: 239
success rate:  0.31
epoch: 155209, loss: 0.533213198184967, rewards: -822, count: 750
success rate:  0.35
epoch: 155219, loss: 0.8764421343803406, rewards: -271, count: 235
success rate:  0.33
epoch: 155229, loss: 1.35989

success rate:  0.25
epoch: 156089, loss: 1.0975764989852905, rewards: -124, count: 124
success rate:  0.28
epoch: 156099, loss: 1.0587860345840454, rewards: -82, count: 82
success rate:  0.27
epoch: 156109, loss: 0.5091621279716492, rewards: -56, count: 56
success rate:  0.28
epoch: 156119, loss: 1.1682482957839966, rewards: -233, count: 224
success rate:  0.27
epoch: 156129, loss: 1.0942869186401367, rewards: -103, count: 94
success rate:  0.28
epoch: 156139, loss: 0.4194604754447937, rewards: -122, count: 86
success rate:  0.29
epoch: 156149, loss: 0.9232414364814758, rewards: -325, count: 262
success rate:  0.3
epoch: 156159, loss: 1.172580361366272, rewards: -265, count: 229
success rate:  0.3
epoch: 156169, loss: 1.215196132659912, rewards: -378, count: 351
success rate:  0.29
epoch: 156179, loss: 1.2342982292175293, rewards: -604, count: 532
success rate:  0.31
epoch: 156189, loss: 1.4009325504302979, rewards: -188, count: 170
success rate:  0.26
epoch: 156199, loss: 0.5985438823

success rate:  0.39
epoch: 157059, loss: 0.5582666397094727, rewards: -293, count: 266
success rate:  0.39
epoch: 157069, loss: 0.991719663143158, rewards: -541, count: 469
success rate:  0.34
epoch: 157079, loss: 1.5387861728668213, rewards: -185, count: 185
success rate:  0.37
epoch: 157089, loss: 0.5750797390937805, rewards: -70, count: 70
success rate:  0.33
epoch: 157099, loss: 1.2950044870376587, rewards: -263, count: 218
success rate:  0.3
epoch: 157109, loss: 1.509450078010559, rewards: -71, count: 71
success rate:  0.31
epoch: 157119, loss: 0.896296501159668, rewards: -251, count: 206
success rate:  0.3
epoch: 157129, loss: 0.595650315284729, rewards: -430, count: 403
success rate:  0.32
epoch: 157139, loss: 0.6168600916862488, rewards: -587, count: 542
success rate:  0.32
epoch: 157149, loss: 1.056908130645752, rewards: -24, count: 24
success rate:  0.34
epoch: 157159, loss: 1.0992907285690308, rewards: -102, count: 102
success rate:  0.34
epoch: 157169, loss: 0.8366085290908

success rate:  0.36
epoch: 158029, loss: 0.6827055215835571, rewards: -137, count: 119
success rate:  0.35
epoch: 158039, loss: 0.9738839268684387, rewards: -153, count: 144
success rate:  0.33
epoch: 158049, loss: 0.6879302859306335, rewards: -70, count: 61
success rate:  0.35
epoch: 158059, loss: 0.8744269013404846, rewards: -110, count: 110
success rate:  0.35
epoch: 158069, loss: 0.603384792804718, rewards: -237, count: 219
success rate:  0.34
epoch: 158079, loss: 1.1082515716552734, rewards: -90, count: 90
success rate:  0.35
epoch: 158089, loss: 0.8893985748291016, rewards: -450, count: 405
success rate:  0.35
epoch: 158099, loss: 1.5140552520751953, rewards: -142, count: 115
success rate:  0.33
epoch: 158109, loss: 0.9934874773025513, rewards: -254, count: 236
success rate:  0.31
epoch: 158119, loss: 1.5180084705352783, rewards: -238, count: 220
success rate:  0.3
epoch: 158129, loss: 0.95939040184021, rewards: -355, count: 328
success rate:  0.31
epoch: 158139, loss: 0.81333172

success rate:  0.41
epoch: 158979, loss: 0.9362452030181885, rewards: -67, count: 67
success rate:  0.4
epoch: 158989, loss: 1.415250539779663, rewards: -146, count: 146
success rate:  0.39
epoch: 158999, loss: 1.1226248741149902, rewards: -269, count: 260
success rate:  0.39
epoch: 159009, loss: 1.521979808807373, rewards: -58, count: 49
success rate:  0.38
epoch: 159019, loss: 0.966461181640625, rewards: -407, count: 380
success rate:  0.39
epoch: 159029, loss: 0.9889612197875977, rewards: -363, count: 327
success rate:  0.39
epoch: 159039, loss: 1.1657887697219849, rewards: -289, count: 271
success rate:  0.37
epoch: 159049, loss: 0.8631851673126221, rewards: -803, count: 749
success rate:  0.39
epoch: 159059, loss: 1.0398398637771606, rewards: -349, count: 349
success rate:  0.39
epoch: 159069, loss: 1.2442998886108398, rewards: -83, count: 83
success rate:  0.41
epoch: 159079, loss: 0.6963998079299927, rewards: -32, count: 32
success rate:  0.4
epoch: 159089, loss: 0.8069429397583

success rate:  0.32
epoch: 159939, loss: 0.8675814270973206, rewards: -381, count: 345
success rate:  0.33
epoch: 159949, loss: 1.0617519617080688, rewards: -108, count: 99
success rate:  0.35
epoch: 159959, loss: 1.2358458042144775, rewards: -49, count: 49
success rate:  0.33
epoch: 159969, loss: 1.0187016725540161, rewards: -388, count: 379
success rate:  0.33
epoch: 159979, loss: 1.123375415802002, rewards: -82, count: 73
success rate:  0.36
epoch: 159989, loss: 0.28682979941368103, rewards: -41, count: 41
success rate:  0.34
epoch: 159999, loss: 1.7494378089904785, rewards: -128, count: 119
success rate:  0.34
epoch: 160009, loss: 1.2853344678878784, rewards: -123, count: 123
success rate:  0.34
epoch: 160019, loss: 1.13959538936615, rewards: -185, count: 176
success rate:  0.35
epoch: 160029, loss: 1.0558356046676636, rewards: -136, count: 136
success rate:  0.35
epoch: 160039, loss: 1.3863401412963867, rewards: -154, count: 145
success rate:  0.33
epoch: 160049, loss: 1.221214175

success rate:  0.34
epoch: 160909, loss: 0.4337693452835083, rewards: -145, count: 136
success rate:  0.36
epoch: 160919, loss: 0.3937731683254242, rewards: -91, count: 91
success rate:  0.36
epoch: 160929, loss: 0.4047742486000061, rewards: -157, count: 139
success rate:  0.35
epoch: 160939, loss: 1.0510411262512207, rewards: -99, count: 72
success rate:  0.34
epoch: 160949, loss: 1.1316478252410889, rewards: -93, count: 93
success rate:  0.36
epoch: 160959, loss: 1.115088939666748, rewards: -106, count: 106
success rate:  0.38
epoch: 160969, loss: 0.8256886005401611, rewards: -90, count: 81
success rate:  0.36
epoch: 160979, loss: 0.7145569324493408, rewards: -53, count: 53
success rate:  0.37
epoch: 160989, loss: 0.6636018753051758, rewards: -56, count: 56
success rate:  0.33
epoch: 160999, loss: 1.0143530368804932, rewards: -332, count: 314
success rate:  0.32
epoch: 161009, loss: 1.0616331100463867, rewards: -264, count: 246
success rate:  0.29
epoch: 161019, loss: 0.0643215328454

success rate:  0.27
epoch: 161869, loss: 1.0792133808135986, rewards: -213, count: 204
success rate:  0.32
epoch: 161879, loss: 1.6442984342575073, rewards: -98, count: 98
success rate:  0.29
epoch: 161889, loss: 1.2044404745101929, rewards: -356, count: 320
success rate:  0.29
epoch: 161899, loss: 0.8344628810882568, rewards: -193, count: 184
success rate:  0.31
epoch: 161909, loss: 1.2064799070358276, rewards: -102, count: 93
success rate:  0.31
epoch: 161919, loss: 1.2013624906539917, rewards: -314, count: 305
success rate:  0.33
epoch: 161929, loss: 1.4473356008529663, rewards: -191, count: 191
success rate:  0.32
epoch: 161939, loss: 0.6745756268501282, rewards: -410, count: 374
success rate:  0.32
epoch: 161949, loss: 0.8173854947090149, rewards: -434, count: 407
success rate:  0.38
epoch: 161959, loss: 1.0861481428146362, rewards: -51, count: 51
success rate:  0.41
epoch: 161969, loss: 1.2605383396148682, rewards: -231, count: 204
success rate:  0.36
epoch: 161979, loss: 0.70395

success rate:  0.28
epoch: 162829, loss: 0.6062885522842407, rewards: -227, count: 191
success rate:  0.3
epoch: 162839, loss: 0.8963502049446106, rewards: -271, count: 262
success rate:  0.32
epoch: 162849, loss: 0.8561723232269287, rewards: -300, count: 291
success rate:  0.31
epoch: 162859, loss: 0.1672116369009018, rewards: -292, count: 274
success rate:  0.29
epoch: 162869, loss: 0.5702440142631531, rewards: -247, count: 211
success rate:  0.3
epoch: 162879, loss: 0.5034605860710144, rewards: -360, count: 333
success rate:  0.32
epoch: 162889, loss: 0.3630077540874481, rewards: -713, count: 623
success rate:  0.29
epoch: 162899, loss: 1.1724193096160889, rewards: -199, count: 199
success rate:  0.28
epoch: 162909, loss: 0.47412899136543274, rewards: -709, count: 646
success rate:  0.27
epoch: 162919, loss: 0.5164353847503662, rewards: -285, count: 267
success rate:  0.25
epoch: 162929, loss: 0.6725978255271912, rewards: -168, count: 168
success rate:  0.27
epoch: 162939, loss: 0.2

success rate:  0.24
epoch: 163789, loss: 1.023337483406067, rewards: -153, count: 144
success rate:  0.29
epoch: 163799, loss: 1.2117364406585693, rewards: -138, count: 138
success rate:  0.3
epoch: 163809, loss: 0.9123889803886414, rewards: -232, count: 232
success rate:  0.3
epoch: 163819, loss: 1.19906747341156, rewards: -282, count: 228
success rate:  0.32
epoch: 163829, loss: 0.9565175771713257, rewards: -187, count: 169
success rate:  0.29
epoch: 163839, loss: 0.7684558629989624, rewards: -361, count: 316
success rate:  0.3
epoch: 163849, loss: 0.7328207492828369, rewards: -140, count: 131
success rate:  0.31
epoch: 163859, loss: 0.7813609838485718, rewards: -252, count: 216
success rate:  0.31
epoch: 163869, loss: 1.3055453300476074, rewards: -114, count: 105
success rate:  0.3
epoch: 163879, loss: 1.2239869832992554, rewards: -140, count: 122
success rate:  0.31
epoch: 163889, loss: 1.0577162504196167, rewards: -73, count: 73
success rate:  0.27
epoch: 163899, loss: 0.771558821

success rate:  0.4
epoch: 164749, loss: 1.0424003601074219, rewards: -190, count: 172
success rate:  0.42
epoch: 164759, loss: 0.8744303584098816, rewards: -77, count: 77
success rate:  0.44
epoch: 164769, loss: 1.050145149230957, rewards: -213, count: 195
success rate:  0.41
epoch: 164779, loss: 0.609366774559021, rewards: -147, count: 147
success rate:  0.4
epoch: 164789, loss: 1.2329827547073364, rewards: -92, count: 92
success rate:  0.41
epoch: 164799, loss: 0.3801250755786896, rewards: -55, count: 55
success rate:  0.42
epoch: 164809, loss: 0.6721277832984924, rewards: -374, count: 356
success rate:  0.42
epoch: 164819, loss: 0.7621940970420837, rewards: -56, count: 56
success rate:  0.4
epoch: 164829, loss: 0.8305209875106812, rewards: -313, count: 250
success rate:  0.37
epoch: 164839, loss: 0.9258980751037598, rewards: -746, count: 701
success rate:  0.36
epoch: 164849, loss: 1.2404214143753052, rewards: -270, count: 225
success rate:  0.34
epoch: 164859, loss: 1.3137947320938

success rate:  0.33
epoch: 165719, loss: 0.3452886939048767, rewards: -125, count: 98
success rate:  0.35
epoch: 165729, loss: 0.5380212664604187, rewards: -385, count: 358
success rate:  0.32
epoch: 165739, loss: 0.443681538105011, rewards: -506, count: 461
success rate:  0.3
epoch: 165749, loss: 0.8333889245986938, rewards: -216, count: 207
success rate:  0.3
epoch: 165759, loss: 0.4448798894882202, rewards: -218, count: 182
success rate:  0.34
epoch: 165769, loss: 0.774213969707489, rewards: -96, count: 96
success rate:  0.3
epoch: 165779, loss: 0.11626327037811279, rewards: -32, count: 32
success rate:  0.31
epoch: 165789, loss: 0.14010176062583923, rewards: -51, count: 51
success rate:  0.28
epoch: 165799, loss: 0.4736666679382324, rewards: -221, count: 203
success rate:  0.3
epoch: 165809, loss: 0.2382107526063919, rewards: -210, count: 183
success rate:  0.3
epoch: 165819, loss: 0.8658667206764221, rewards: -347, count: 293
success rate:  0.29
epoch: 165829, loss: 0.716276049613

success rate:  0.29
epoch: 166689, loss: 0.8471489548683167, rewards: -312, count: 294
success rate:  0.3
epoch: 166699, loss: 0.7479628920555115, rewards: -79, count: 79
success rate:  0.28
epoch: 166709, loss: 0.6504618525505066, rewards: -194, count: 185
success rate:  0.29
epoch: 166719, loss: 0.847960889339447, rewards: -354, count: 336
success rate:  0.31
epoch: 166729, loss: 0.5517627596855164, rewards: -488, count: 452
success rate:  0.32
epoch: 166739, loss: 0.48538604378700256, rewards: -40, count: 40
success rate:  0.33
epoch: 166749, loss: 1.003377079963684, rewards: -79, count: 70
success rate:  0.36
epoch: 166759, loss: 0.7092623710632324, rewards: -193, count: 184
success rate:  0.4
epoch: 166769, loss: 0.6588515639305115, rewards: -269, count: 242
success rate:  0.4
epoch: 166779, loss: 0.5723856091499329, rewards: -423, count: 396
success rate:  0.43
epoch: 166789, loss: 0.9636397361755371, rewards: -53, count: 53
success rate:  0.4
epoch: 166799, loss: 0.8567563891410

success rate:  0.3
epoch: 167649, loss: 1.0652683973312378, rewards: -102, count: 102
success rate:  0.29
epoch: 167659, loss: 0.4299389123916626, rewards: -446, count: 374
success rate:  0.3
epoch: 167669, loss: 1.025325894355774, rewards: -308, count: 308
success rate:  0.3
epoch: 167679, loss: 0.5385432839393616, rewards: -272, count: 236
success rate:  0.33
epoch: 167689, loss: 1.233790397644043, rewards: -207, count: 207
success rate:  0.32
epoch: 167699, loss: 0.5921066403388977, rewards: -706, count: 616
success rate:  0.3
epoch: 167709, loss: 0.6830358505249023, rewards: -522, count: 450
success rate:  0.31
epoch: 167719, loss: 0.4899861812591553, rewards: -190, count: 154
success rate:  0.31
epoch: 167729, loss: 0.18843945860862732, rewards: -154, count: 136
success rate:  0.31
epoch: 167739, loss: 0.8000633716583252, rewards: -487, count: 433
success rate:  0.31
epoch: 167749, loss: 1.281179666519165, rewards: -365, count: 311
success rate:  0.33
epoch: 167759, loss: 0.965907

success rate:  0.36
epoch: 168609, loss: 1.4540555477142334, rewards: -269, count: 251
success rate:  0.4
epoch: 168619, loss: 0.5473288893699646, rewards: -101, count: 83
success rate:  0.38
epoch: 168629, loss: 0.8308241367340088, rewards: -55, count: 55
success rate:  0.36
epoch: 168639, loss: 1.167622685432434, rewards: -60, count: 60
success rate:  0.4
epoch: 168649, loss: 1.0862191915512085, rewards: -97, count: 97
success rate:  0.44
epoch: 168659, loss: 1.3451080322265625, rewards: -81, count: 72
success rate:  0.48
epoch: 168669, loss: 1.0760458707809448, rewards: -48, count: 48
success rate:  0.48
epoch: 168679, loss: 0.8770975470542908, rewards: -404, count: 377
success rate:  0.47
epoch: 168689, loss: 1.004064917564392, rewards: -472, count: 418
success rate:  0.43
epoch: 168699, loss: 0.9512566328048706, rewards: -90, count: 90
success rate:  0.46
epoch: 168709, loss: 0.6962192058563232, rewards: -346, count: 319
success rate:  0.45
epoch: 168719, loss: 0.7072116732597351,

success rate:  0.44
epoch: 169569, loss: 1.4305332899093628, rewards: -35, count: 35
success rate:  0.39
epoch: 169579, loss: 1.173803448677063, rewards: -114, count: 105
success rate:  0.41
epoch: 169589, loss: 1.5708000659942627, rewards: -74, count: 74
success rate:  0.38
epoch: 169599, loss: 1.313584327697754, rewards: -349, count: 304
success rate:  0.35
epoch: 169609, loss: 1.3648067712783813, rewards: -191, count: 182
success rate:  0.36
epoch: 169619, loss: 1.288368582725525, rewards: -64, count: 64
success rate:  0.4
epoch: 169629, loss: 1.4041846990585327, rewards: -120, count: 111
success rate:  0.39
epoch: 169639, loss: 1.423546552658081, rewards: -79, count: 79
success rate:  0.4
epoch: 169649, loss: 0.3412420451641083, rewards: -207, count: 162
success rate:  0.36
epoch: 169659, loss: 1.4474714994430542, rewards: -193, count: 175
success rate:  0.35
epoch: 169669, loss: 0.8875368237495422, rewards: -373, count: 346
success rate:  0.39
epoch: 169679, loss: 1.73537993431091

success rate:  0.29
epoch: 170529, loss: 1.1998212337493896, rewards: -75, count: 75
success rate:  0.31
epoch: 170539, loss: 0.44379693269729614, rewards: -61, count: 61
success rate:  0.3
epoch: 170549, loss: 1.1476179361343384, rewards: -170, count: 161
success rate:  0.31
epoch: 170559, loss: 0.6036703586578369, rewards: -818, count: 773
success rate:  0.32
epoch: 170569, loss: 0.9735384583473206, rewards: -76, count: 76
success rate:  0.33
epoch: 170579, loss: 1.439592957496643, rewards: -215, count: 206
success rate:  0.33
epoch: 170589, loss: 1.4545260667800903, rewards: -262, count: 235
success rate:  0.33
epoch: 170599, loss: 1.561795711517334, rewards: -279, count: 234
success rate:  0.34
epoch: 170609, loss: 1.122239589691162, rewards: -291, count: 282
success rate:  0.35
epoch: 170619, loss: 1.1570953130722046, rewards: -126, count: 126
success rate:  0.35
epoch: 170629, loss: 1.329257845878601, rewards: -166, count: 157
success rate:  0.34
epoch: 170639, loss: 0.8036001920

success rate:  0.26
epoch: 171499, loss: 0.8543873429298401, rewards: -328, count: 301
success rate:  0.26
epoch: 171509, loss: 0.2531314492225647, rewards: -208, count: 199
success rate:  0.27
epoch: 171519, loss: 0.42074713110923767, rewards: -255, count: 237
success rate:  0.28
epoch: 171529, loss: 1.5666276216506958, rewards: -232, count: 196
success rate:  0.29
epoch: 171539, loss: 1.2030857801437378, rewards: -90, count: 90
success rate:  0.31
epoch: 171549, loss: 1.5844857692718506, rewards: -56, count: 56
success rate:  0.29
epoch: 171559, loss: 1.1707618236541748, rewards: -190, count: 181
success rate:  0.3
epoch: 171569, loss: 1.0604568719863892, rewards: -174, count: 174
success rate:  0.3
epoch: 171579, loss: 1.048133134841919, rewards: -211, count: 193
success rate:  0.32
epoch: 171589, loss: 0.6527907848358154, rewards: -311, count: 293
success rate:  0.33
epoch: 171599, loss: 1.03445565700531, rewards: -344, count: 335
success rate:  0.33
epoch: 171609, loss: 0.85394418

success rate:  0.44
epoch: 172449, loss: 0.8649481534957886, rewards: -270, count: 261
success rate:  0.41
epoch: 172459, loss: 1.0382574796676636, rewards: -138, count: 129
success rate:  0.41
epoch: 172469, loss: 0.7557976841926575, rewards: -446, count: 401
success rate:  0.35
epoch: 172479, loss: 1.217430591583252, rewards: -143, count: 107
success rate:  0.37
epoch: 172489, loss: 1.408576488494873, rewards: -138, count: 120
success rate:  0.4
epoch: 172499, loss: 0.7900510430335999, rewards: -66, count: 66
success rate:  0.38
epoch: 172509, loss: 1.1381326913833618, rewards: -156, count: 147
success rate:  0.38
epoch: 172519, loss: 0.276779443025589, rewards: -136, count: 118
success rate:  0.38
epoch: 172529, loss: 1.1209335327148438, rewards: -345, count: 327
success rate:  0.38
epoch: 172539, loss: 0.6566846966743469, rewards: -55, count: 46
success rate:  0.41
epoch: 172549, loss: 0.23532472550868988, rewards: -253, count: 217
success rate:  0.41
epoch: 172559, loss: 0.9753776

success rate:  0.33
epoch: 173409, loss: 0.704705536365509, rewards: -36, count: 36
success rate:  0.33
epoch: 173419, loss: 2.3117780685424805, rewards: -123, count: 114
success rate:  0.32
epoch: 173429, loss: 0.9803593158721924, rewards: -222, count: 204
success rate:  0.31
epoch: 173439, loss: 1.7222113609313965, rewards: -195, count: 168
success rate:  0.31
epoch: 173449, loss: 0.807429313659668, rewards: -348, count: 294
success rate:  0.33
epoch: 173459, loss: 1.2795923948287964, rewards: -101, count: 101
success rate:  0.36
epoch: 173469, loss: 1.5014644861221313, rewards: -212, count: 203
success rate:  0.31
epoch: 173479, loss: 1.336500883102417, rewards: -226, count: 217
success rate:  0.34
epoch: 173489, loss: 1.370039939880371, rewards: -136, count: 118
success rate:  0.34
epoch: 173499, loss: 0.9891520738601685, rewards: -158, count: 140
success rate:  0.38
epoch: 173509, loss: 1.042220950126648, rewards: -61, count: 61
success rate:  0.37
epoch: 173519, loss: 0.847532927

success rate:  0.29
epoch: 174369, loss: 1.440226435661316, rewards: -140, count: 131
success rate:  0.33
epoch: 174379, loss: 1.1913902759552002, rewards: -119, count: 119
success rate:  0.35
epoch: 174389, loss: 1.6656607389450073, rewards: -109, count: 109
success rate:  0.34
epoch: 174399, loss: 1.426196813583374, rewards: -78, count: 78
success rate:  0.31
epoch: 174409, loss: 1.6385376453399658, rewards: -97, count: 79
success rate:  0.32
epoch: 174419, loss: 0.9127508401870728, rewards: -132, count: 114
success rate:  0.33
epoch: 174429, loss: 1.5070879459381104, rewards: -175, count: 166
success rate:  0.33
epoch: 174439, loss: 1.8219587802886963, rewards: -152, count: 134
success rate:  0.34
epoch: 174449, loss: 0.8282352089881897, rewards: -121, count: 112
success rate:  0.33
epoch: 174459, loss: 1.3549941778182983, rewards: -144, count: 144
success rate:  0.34
epoch: 174469, loss: 0.7721885442733765, rewards: -967, count: 895
success rate:  0.3
epoch: 174479, loss: 1.4785249

success rate:  0.34
epoch: 175319, loss: 0.61543869972229, rewards: -171, count: 144
success rate:  0.34
epoch: 175329, loss: 0.7852687239646912, rewards: -342, count: 333
success rate:  0.33
epoch: 175339, loss: 1.7968254089355469, rewards: -199, count: 154
success rate:  0.32
epoch: 175349, loss: 1.0828089714050293, rewards: -95, count: 95
success rate:  0.35
epoch: 175359, loss: 1.357047438621521, rewards: -177, count: 168
success rate:  0.34
epoch: 175369, loss: 1.350059151649475, rewards: -161, count: 161
success rate:  0.33
epoch: 175379, loss: 1.580324649810791, rewards: -245, count: 236
success rate:  0.25
epoch: 175389, loss: 1.7664649486541748, rewards: -287, count: 260
success rate:  0.25
epoch: 175399, loss: 2.0732951164245605, rewards: -121, count: 103
success rate:  0.26
epoch: 175409, loss: 1.570061206817627, rewards: -378, count: 351
success rate:  0.29
epoch: 175419, loss: 1.5400941371917725, rewards: -522, count: 468
success rate:  0.29
epoch: 175429, loss: 0.81847763

success rate:  0.4
epoch: 176279, loss: 0.6459442377090454, rewards: -104, count: 95
success rate:  0.37
epoch: 176289, loss: 1.1746013164520264, rewards: -197, count: 179
success rate:  0.37
epoch: 176299, loss: 0.8844607472419739, rewards: -341, count: 296
success rate:  0.37
epoch: 176309, loss: 0.6285253167152405, rewards: -109, count: 100
success rate:  0.39
epoch: 176319, loss: 1.2574436664581299, rewards: -77, count: 77
success rate:  0.35
epoch: 176329, loss: 0.6553652882575989, rewards: -217, count: 199
success rate:  0.32
epoch: 176339, loss: 1.0084201097488403, rewards: -545, count: 491
success rate:  0.3
epoch: 176349, loss: 0.7348234057426453, rewards: -438, count: 366
success rate:  0.29
epoch: 176359, loss: 1.568952202796936, rewards: -192, count: 192
success rate:  0.32
epoch: 176369, loss: 0.8400167226791382, rewards: -654, count: 591
success rate:  0.32
epoch: 176379, loss: 2.3779470920562744, rewards: -40, count: 40
success rate:  0.34
epoch: 176389, loss: 0.69765359

success rate:  0.25
epoch: 177249, loss: 1.4986072778701782, rewards: -188, count: 179
success rate:  0.24
epoch: 177259, loss: 1.2030092477798462, rewards: -169, count: 160
success rate:  0.23
epoch: 177269, loss: 1.0116188526153564, rewards: -206, count: 197
success rate:  0.25
epoch: 177279, loss: 0.9395022392272949, rewards: -275, count: 230
success rate:  0.26
epoch: 177289, loss: 1.1427462100982666, rewards: -271, count: 253
success rate:  0.27
epoch: 177299, loss: 1.178379774093628, rewards: -121, count: 112
success rate:  0.27
epoch: 177309, loss: 0.7833399176597595, rewards: -175, count: 157
success rate:  0.24
epoch: 177319, loss: 0.9479647874832153, rewards: -190, count: 172
success rate:  0.26
epoch: 177329, loss: 0.9555922746658325, rewards: -40, count: 40
success rate:  0.28
epoch: 177339, loss: 1.3735806941986084, rewards: -159, count: 150
success rate:  0.29
epoch: 177349, loss: 0.5245923399925232, rewards: -370, count: 307
success rate:  0.29
epoch: 177359, loss: 1.202

success rate:  0.32
epoch: 178209, loss: 1.2815487384796143, rewards: -62, count: 62
success rate:  0.31
epoch: 178219, loss: 0.6595640182495117, rewards: -665, count: 629
success rate:  0.32
epoch: 178229, loss: 1.2854424715042114, rewards: -221, count: 221
success rate:  0.34
epoch: 178239, loss: 0.9367265701293945, rewards: -40, count: 40
success rate:  0.33
epoch: 178249, loss: 1.3062245845794678, rewards: -135, count: 117
success rate:  0.33
epoch: 178259, loss: 1.2117003202438354, rewards: -85, count: 85
success rate:  0.34
epoch: 178269, loss: 1.405113697052002, rewards: -70, count: 70
success rate:  0.33
epoch: 178279, loss: 0.6908165216445923, rewards: -109, count: 109
success rate:  0.33
epoch: 178289, loss: 0.5724264979362488, rewards: -154, count: 127
success rate:  0.32
epoch: 178299, loss: 1.0601526498794556, rewards: -155, count: 155
success rate:  0.34
epoch: 178309, loss: 0.9930703639984131, rewards: -317, count: 299
success rate:  0.31
epoch: 178319, loss: 0.851657748

success rate:  0.3
epoch: 179169, loss: 0.5249980092048645, rewards: -91, count: 82
success rate:  0.29
epoch: 179179, loss: 1.2015501260757446, rewards: -219, count: 219
success rate:  0.3
epoch: 179189, loss: 0.7600467205047607, rewards: -50, count: 50
success rate:  0.31
epoch: 179199, loss: 0.5501031279563904, rewards: -799, count: 745
success rate:  0.33
epoch: 179209, loss: 1.0338176488876343, rewards: -350, count: 332
success rate:  0.37
epoch: 179219, loss: 0.7203202247619629, rewards: -77, count: 77
success rate:  0.4
epoch: 179229, loss: 1.0030277967453003, rewards: -89, count: 89
success rate:  0.34
epoch: 179239, loss: 0.9620578289031982, rewards: -229, count: 211
success rate:  0.34
epoch: 179249, loss: 0.840309739112854, rewards: -67, count: 67
success rate:  0.34
epoch: 179259, loss: 1.1967217922210693, rewards: -223, count: 196
success rate:  0.34
epoch: 179269, loss: 0.6991295218467712, rewards: -681, count: 627
success rate:  0.32
epoch: 179279, loss: 0.86394262313842

success rate:  0.25
epoch: 180139, loss: 1.4938974380493164, rewards: -181, count: 163
success rate:  0.25
epoch: 180149, loss: 1.0474716424942017, rewards: -115, count: 115
success rate:  0.26
epoch: 180159, loss: 1.2581486701965332, rewards: -503, count: 467
success rate:  0.28
epoch: 180169, loss: 1.1516001224517822, rewards: -311, count: 266
success rate:  0.29
epoch: 180179, loss: 1.2670127153396606, rewards: -185, count: 176
success rate:  0.29
epoch: 180189, loss: 1.428501009941101, rewards: -348, count: 294
success rate:  0.3
epoch: 180199, loss: 0.8410684466362, rewards: -109, count: 100
success rate:  0.31
epoch: 180209, loss: 0.9335286617279053, rewards: -131, count: 122
success rate:  0.29
epoch: 180219, loss: 0.890170156955719, rewards: -387, count: 378
success rate:  0.31
epoch: 180229, loss: 1.1792668104171753, rewards: -320, count: 302
success rate:  0.31
epoch: 180239, loss: 1.6217516660690308, rewards: -74, count: 65
success rate:  0.28
epoch: 180249, loss: 1.01587092

success rate:  0.36
epoch: 181099, loss: 1.2978323698043823, rewards: -105, count: 105
success rate:  0.35
epoch: 181109, loss: 0.7584010362625122, rewards: -179, count: 170
success rate:  0.34
epoch: 181119, loss: 0.625307559967041, rewards: -166, count: 139
success rate:  0.31
epoch: 181129, loss: 1.4447071552276611, rewards: -86, count: 77
success rate:  0.31
epoch: 181139, loss: 1.0413901805877686, rewards: -138, count: 120
success rate:  0.32
epoch: 181149, loss: 0.6547139286994934, rewards: -177, count: 168
success rate:  0.34
epoch: 181159, loss: 1.4234472513198853, rewards: -94, count: 76
success rate:  0.33
epoch: 181169, loss: 1.238084316253662, rewards: -52, count: 52
success rate:  0.34
epoch: 181179, loss: 1.277162790298462, rewards: -147, count: 147
success rate:  0.33
epoch: 181189, loss: 0.6997466683387756, rewards: -99, count: 81
success rate:  0.34
epoch: 181199, loss: 0.5672481656074524, rewards: -111, count: 111
success rate:  0.35
epoch: 181209, loss: 0.69745689630

success rate:  0.39
epoch: 182069, loss: 0.4908699691295624, rewards: -169, count: 169
success rate:  0.4
epoch: 182079, loss: 0.6519430875778198, rewards: -137, count: 119
success rate:  0.41
epoch: 182089, loss: 0.7146487236022949, rewards: -510, count: 465
success rate:  0.38
epoch: 182099, loss: 0.6909971833229065, rewards: -109, count: 100
success rate:  0.37
epoch: 182109, loss: 0.7272318005561829, rewards: -168, count: 159
success rate:  0.35
epoch: 182119, loss: 0.3734544515609741, rewards: -321, count: 285
success rate:  0.37
epoch: 182129, loss: 0.5671296119689941, rewards: -575, count: 530
success rate:  0.34
epoch: 182139, loss: 1.241440773010254, rewards: -59, count: 41
success rate:  0.32
epoch: 182149, loss: 1.0307384729385376, rewards: -77, count: 59
success rate:  0.34
epoch: 182159, loss: 0.585746705532074, rewards: -404, count: 350
success rate:  0.31
epoch: 182169, loss: 0.9706758260726929, rewards: -65, count: 65
success rate:  0.3
epoch: 182179, loss: 0.4070275127

success rate:  0.32
epoch: 183029, loss: 1.353117823600769, rewards: -158, count: 149
success rate:  0.31
epoch: 183039, loss: 0.9531747698783875, rewards: -319, count: 292
success rate:  0.29
epoch: 183049, loss: 1.5363277196884155, rewards: -180, count: 171
success rate:  0.26
epoch: 183059, loss: 0.7190654277801514, rewards: -323, count: 296
success rate:  0.26
epoch: 183069, loss: 1.6701053380966187, rewards: -89, count: 89
success rate:  0.26
epoch: 183079, loss: 2.5030176639556885, rewards: -85, count: 76
success rate:  0.29
epoch: 183089, loss: 0.8002198338508606, rewards: -271, count: 253
success rate:  0.31
epoch: 183099, loss: 1.1633468866348267, rewards: -107, count: 89
success rate:  0.36
epoch: 183109, loss: 1.3182849884033203, rewards: -146, count: 137
success rate:  0.34
epoch: 183119, loss: 1.144302487373352, rewards: -382, count: 355
success rate:  0.35
epoch: 183129, loss: 1.3530536890029907, rewards: -94, count: 85
success rate:  0.37
epoch: 183139, loss: 1.312499165

success rate:  0.37
epoch: 183989, loss: 1.1771180629730225, rewards: -207, count: 180
success rate:  0.38
epoch: 183999, loss: 0.9531898498535156, rewards: -393, count: 375
success rate:  0.38
epoch: 184009, loss: 1.1569101810455322, rewards: -62, count: 62
success rate:  0.34
epoch: 184019, loss: 1.0744352340698242, rewards: -594, count: 585
success rate:  0.34
epoch: 184029, loss: 1.5069869756698608, rewards: -121, count: 112
success rate:  0.37
epoch: 184039, loss: 1.2673556804656982, rewards: -242, count: 242
success rate:  0.38
epoch: 184049, loss: 0.874650239944458, rewards: -81, count: 72
success rate:  0.35
epoch: 184059, loss: 0.9230116605758667, rewards: -462, count: 417
success rate:  0.36
epoch: 184069, loss: 1.4940986633300781, rewards: -364, count: 337
success rate:  0.33
epoch: 184079, loss: 1.078150749206543, rewards: -207, count: 189
success rate:  0.36
epoch: 184089, loss: 1.793838620185852, rewards: -108, count: 99
success rate:  0.37
epoch: 184099, loss: 0.62640070

success rate:  0.32
epoch: 184949, loss: 0.6948689818382263, rewards: -327, count: 273
success rate:  0.32
epoch: 184959, loss: 0.6857361793518066, rewards: -269, count: 251
success rate:  0.3
epoch: 184969, loss: 0.7349290251731873, rewards: -387, count: 369
success rate:  0.32
epoch: 184979, loss: 0.2996009290218353, rewards: -141, count: 123
success rate:  0.35
epoch: 184989, loss: 0.8613804578781128, rewards: -122, count: 122
success rate:  0.33
epoch: 184999, loss: 0.5879672169685364, rewards: -606, count: 552
success rate:  0.32
epoch: 185009, loss: 0.6258407235145569, rewards: -131, count: 131
success rate:  0.36
epoch: 185019, loss: 0.8052046298980713, rewards: -183, count: 165
success rate:  0.34
epoch: 185029, loss: 1.3566757440567017, rewards: -70, count: 70
success rate:  0.31
epoch: 185039, loss: 0.8790337443351746, rewards: -614, count: 578
success rate:  0.3
epoch: 185049, loss: 1.5074819326400757, rewards: -84, count: 84
success rate:  0.34
epoch: 185059, loss: 1.280428

success rate:  0.26
epoch: 185909, loss: 0.3915267884731293, rewards: -50, count: 50
success rate:  0.3
epoch: 185919, loss: 0.7767554521560669, rewards: -119, count: 119
success rate:  0.31
epoch: 185929, loss: 0.5316442251205444, rewards: -325, count: 298
success rate:  0.32
epoch: 185939, loss: 0.7237447500228882, rewards: -151, count: 151
success rate:  0.33
epoch: 185949, loss: 0.6821212768554688, rewards: -57, count: 57
success rate:  0.32
epoch: 185959, loss: 0.5474762916564941, rewards: -507, count: 462
success rate:  0.33
epoch: 185969, loss: 0.7722669839859009, rewards: -83, count: 83
success rate:  0.35
epoch: 185979, loss: 1.7264784574508667, rewards: -82, count: 73
success rate:  0.33
epoch: 185989, loss: 1.0999878644943237, rewards: -128, count: 119
success rate:  0.33
epoch: 185999, loss: 1.4543545246124268, rewards: -182, count: 146
success rate:  0.33
epoch: 186009, loss: 1.1812191009521484, rewards: -43, count: 43
success rate:  0.31
epoch: 186019, loss: 0.17459066212

success rate:  0.36
epoch: 186869, loss: 1.1446160078048706, rewards: -79, count: 79
success rate:  0.32
epoch: 186879, loss: 1.309783935546875, rewards: -126, count: 108
success rate:  0.34
epoch: 186889, loss: 1.0505926609039307, rewards: -197, count: 188
success rate:  0.36
epoch: 186899, loss: 0.6968840956687927, rewards: -72, count: 72
success rate:  0.38
epoch: 186909, loss: 0.894051194190979, rewards: -88, count: 88
success rate:  0.35
epoch: 186919, loss: 0.9143046736717224, rewards: -309, count: 282
success rate:  0.33
epoch: 186929, loss: 1.0580893754959106, rewards: -247, count: 229
success rate:  0.35
epoch: 186939, loss: 0.6495575308799744, rewards: -700, count: 664
success rate:  0.33
epoch: 186949, loss: 0.46675917506217957, rewards: -326, count: 308
success rate:  0.31
epoch: 186959, loss: 1.0902047157287598, rewards: -121, count: 112
success rate:  0.28
epoch: 186969, loss: 1.1860733032226562, rewards: -133, count: 124
success rate:  0.32
epoch: 186979, loss: 1.4148989

success rate:  0.35
epoch: 187829, loss: 0.9985389113426208, rewards: -265, count: 247
success rate:  0.33
epoch: 187839, loss: 0.4516123831272125, rewards: -66, count: 66
success rate:  0.3
epoch: 187849, loss: 0.5463129281997681, rewards: -86, count: 86
success rate:  0.32
epoch: 187859, loss: 1.1390161514282227, rewards: -341, count: 332
success rate:  0.32
epoch: 187869, loss: 0.5824426412582397, rewards: -42, count: 42
success rate:  0.33
epoch: 187879, loss: 0.6412469744682312, rewards: -75, count: 75
success rate:  0.35
epoch: 187889, loss: 0.7873529195785522, rewards: -291, count: 273
success rate:  0.31
epoch: 187899, loss: 0.8443810343742371, rewards: -72, count: 72
success rate:  0.31
epoch: 187909, loss: 0.5058224201202393, rewards: -168, count: 141
success rate:  0.34
epoch: 187919, loss: 0.6569682359695435, rewards: -98, count: 98
success rate:  0.32
epoch: 187929, loss: 0.5049045085906982, rewards: -163, count: 154
success rate:  0.3
epoch: 187939, loss: 1.04899811744689

success rate:  0.29
epoch: 188789, loss: 0.2527691125869751, rewards: -310, count: 283
success rate:  0.26
epoch: 188799, loss: 1.0846530199050903, rewards: -179, count: 161
success rate:  0.26
epoch: 188809, loss: 0.8122281432151794, rewards: -238, count: 220
success rate:  0.27
epoch: 188819, loss: 0.46791377663612366, rewards: -56, count: 56
success rate:  0.27
epoch: 188829, loss: 0.5817249417304993, rewards: -98, count: 98
success rate:  0.24
epoch: 188839, loss: 0.7874829173088074, rewards: -65, count: 65
success rate:  0.29
epoch: 188849, loss: 0.8087942600250244, rewards: -91, count: 82
success rate:  0.31
epoch: 188859, loss: 1.2397760152816772, rewards: -188, count: 161
success rate:  0.33
epoch: 188869, loss: 0.8645707368850708, rewards: -321, count: 303
success rate:  0.35
epoch: 188879, loss: 0.8801580667495728, rewards: -58, count: 58
success rate:  0.35
epoch: 188889, loss: 0.7878193259239197, rewards: -150, count: 123
success rate:  0.35
epoch: 188899, loss: 0.983907639

success rate:  0.33
epoch: 189749, loss: 0.5108080506324768, rewards: -157, count: 148
success rate:  0.31
epoch: 189759, loss: 1.1157960891723633, rewards: -286, count: 259
success rate:  0.32
epoch: 189769, loss: 0.4199488162994385, rewards: -353, count: 317
success rate:  0.3
epoch: 189779, loss: 0.516338586807251, rewards: -295, count: 277
success rate:  0.3
epoch: 189789, loss: 1.0401365756988525, rewards: -296, count: 269
success rate:  0.28
epoch: 189799, loss: 0.6894612312316895, rewards: -709, count: 646
success rate:  0.31
epoch: 189809, loss: 1.2821468114852905, rewards: -57, count: 57
success rate:  0.34
epoch: 189819, loss: 1.0178108215332031, rewards: -113, count: 113
success rate:  0.34
epoch: 189829, loss: 0.7857308387756348, rewards: -156, count: 147
success rate:  0.34
epoch: 189839, loss: 0.8195525407791138, rewards: -235, count: 235
success rate:  0.34
epoch: 189849, loss: 1.4218649864196777, rewards: -83, count: 65
success rate:  0.35
epoch: 189859, loss: 0.5229820

success rate:  0.33
epoch: 190709, loss: 1.0798407793045044, rewards: -78, count: 78
success rate:  0.33
epoch: 190719, loss: 0.5795580148696899, rewards: -65, count: 65
success rate:  0.32
epoch: 190729, loss: 0.40339773893356323, rewards: -80, count: 80
success rate:  0.34
epoch: 190739, loss: 0.7681485414505005, rewards: -246, count: 228
success rate:  0.35
epoch: 190749, loss: 0.6912514567375183, rewards: -72, count: 72
success rate:  0.34
epoch: 190759, loss: 0.6450847387313843, rewards: -350, count: 314
success rate:  0.33
epoch: 190769, loss: 1.2874810695648193, rewards: -73, count: 73
success rate:  0.35
epoch: 190779, loss: 1.357643723487854, rewards: -56, count: 47
success rate:  0.36
epoch: 190789, loss: 0.28083130717277527, rewards: -33, count: 33
success rate:  0.36
epoch: 190799, loss: 0.9548541903495789, rewards: -326, count: 290
success rate:  0.35
epoch: 190809, loss: 1.168735146522522, rewards: -112, count: 94
success rate:  0.35
epoch: 190819, loss: 1.596394419670105

success rate:  0.36
epoch: 191669, loss: 1.1066358089447021, rewards: -150, count: 150
success rate:  0.37
epoch: 191679, loss: 0.7340168356895447, rewards: -318, count: 273
success rate:  0.39
epoch: 191689, loss: 1.1142678260803223, rewards: -79, count: 79
success rate:  0.4
epoch: 191699, loss: 0.5377746224403381, rewards: -731, count: 641
success rate:  0.35
epoch: 191709, loss: 1.1336849927902222, rewards: -117, count: 99
success rate:  0.38
epoch: 191719, loss: 1.1932487487792969, rewards: -332, count: 314
success rate:  0.4
epoch: 191729, loss: 1.1167488098144531, rewards: -164, count: 146
success rate:  0.41
epoch: 191739, loss: 1.513545274734497, rewards: -178, count: 142
success rate:  0.37
epoch: 191749, loss: 1.5208464860916138, rewards: -326, count: 299
success rate:  0.35
epoch: 191759, loss: 1.1112060546875, rewards: -181, count: 154
success rate:  0.38
epoch: 191769, loss: 1.1624622344970703, rewards: -221, count: 212
success rate:  0.38
epoch: 191779, loss: 1.117827534

success rate:  0.36
epoch: 192619, loss: 0.9210789203643799, rewards: -444, count: 408
success rate:  0.37
epoch: 192629, loss: 1.0659781694412231, rewards: -187, count: 169
success rate:  0.37
epoch: 192639, loss: 1.8391649723052979, rewards: -99, count: 99
success rate:  0.39
epoch: 192649, loss: 0.4994739592075348, rewards: -144, count: 135
success rate:  0.4
epoch: 192659, loss: 1.1811941862106323, rewards: -140, count: 131
success rate:  0.35
epoch: 192669, loss: 1.3468214273452759, rewards: -145, count: 127
success rate:  0.36
epoch: 192679, loss: 1.137448787689209, rewards: -130, count: 121
success rate:  0.38
epoch: 192689, loss: 0.961163341999054, rewards: -96, count: 96
success rate:  0.39
epoch: 192699, loss: 1.3773727416992188, rewards: -195, count: 195
success rate:  0.4
epoch: 192709, loss: 0.8940066695213318, rewards: -397, count: 343
success rate:  0.41
epoch: 192719, loss: 1.332863688468933, rewards: -150, count: 150
success rate:  0.41
epoch: 192729, loss: 1.122176766

success rate:  0.35
epoch: 193579, loss: 0.5468935966491699, rewards: -55, count: 55
success rate:  0.33
epoch: 193589, loss: 1.1630185842514038, rewards: -101, count: 101
success rate:  0.31
epoch: 193599, loss: 1.5811398029327393, rewards: -238, count: 220
success rate:  0.31
epoch: 193609, loss: 1.499466896057129, rewards: -240, count: 222
success rate:  0.34
epoch: 193619, loss: 1.1311511993408203, rewards: -106, count: 106
success rate:  0.35
epoch: 193629, loss: 1.1434756517410278, rewards: -42, count: 42
success rate:  0.35
epoch: 193639, loss: 0.9663587808609009, rewards: -186, count: 177
success rate:  0.32
epoch: 193649, loss: 1.0838879346847534, rewards: -365, count: 320
success rate:  0.3
epoch: 193659, loss: 1.0062158107757568, rewards: -350, count: 314
success rate:  0.29
epoch: 193669, loss: 0.5340113043785095, rewards: -167, count: 158
success rate:  0.28
epoch: 193679, loss: 1.282105565071106, rewards: -403, count: 358
success rate:  0.26
epoch: 193689, loss: 1.3763326

success rate:  0.29
epoch: 194539, loss: 0.6310946941375732, rewards: -49, count: 49
success rate:  0.32
epoch: 194549, loss: 1.212327241897583, rewards: -263, count: 236
success rate:  0.3
epoch: 194559, loss: 0.7581688165664673, rewards: -630, count: 549
success rate:  0.33
epoch: 194569, loss: 0.9519805312156677, rewards: -250, count: 214
success rate:  0.33
epoch: 194579, loss: 0.6643631458282471, rewards: -425, count: 371
success rate:  0.34
epoch: 194589, loss: 1.5212604999542236, rewards: -279, count: 261
success rate:  0.35
epoch: 194599, loss: 0.9946507215499878, rewards: -229, count: 211
success rate:  0.35
epoch: 194609, loss: 1.607279658317566, rewards: -34, count: 34
success rate:  0.34
epoch: 194619, loss: 1.6802525520324707, rewards: -110, count: 92
success rate:  0.34
epoch: 194629, loss: 1.453918218612671, rewards: -348, count: 330
success rate:  0.3
epoch: 194639, loss: 1.0532556772232056, rewards: -468, count: 441
success rate:  0.3
epoch: 194649, loss: 0.99845129251

success rate:  0.4
epoch: 195499, loss: 1.1597262620925903, rewards: -402, count: 384
success rate:  0.39
epoch: 195509, loss: 1.15739905834198, rewards: -83, count: 83
success rate:  0.4
epoch: 195519, loss: 1.8667904138565063, rewards: -50, count: 50
success rate:  0.38
epoch: 195529, loss: 1.7602285146713257, rewards: -113, count: 113
success rate:  0.38
epoch: 195539, loss: 1.2760506868362427, rewards: -49, count: 49
success rate:  0.36
epoch: 195549, loss: 1.442826747894287, rewards: -165, count: 156
success rate:  0.35
epoch: 195559, loss: 0.8838554620742798, rewards: -141, count: 132
success rate:  0.32
epoch: 195569, loss: 1.3171907663345337, rewards: -206, count: 170
success rate:  0.32
epoch: 195579, loss: 1.3421698808670044, rewards: -112, count: 94
success rate:  0.35
epoch: 195589, loss: 0.7817897200584412, rewards: -52, count: 52
success rate:  0.34
epoch: 195599, loss: 1.2750749588012695, rewards: -120, count: 111
success rate:  0.34
epoch: 195609, loss: 1.25224828720092

success rate:  0.36
epoch: 196459, loss: 1.1784703731536865, rewards: -44, count: 35
success rate:  0.38
epoch: 196469, loss: 1.45524001121521, rewards: -137, count: 128
success rate:  0.41
epoch: 196479, loss: 1.3714134693145752, rewards: -236, count: 209
success rate:  0.43
epoch: 196489, loss: 1.2389320135116577, rewards: -403, count: 331
success rate:  0.41
epoch: 196499, loss: 1.6032052040100098, rewards: -107, count: 98
success rate:  0.44
epoch: 196509, loss: 1.296581745147705, rewards: -240, count: 222
success rate:  0.4
epoch: 196519, loss: 1.573933720588684, rewards: -207, count: 198
success rate:  0.37
epoch: 196529, loss: 1.7487331628799438, rewards: -60, count: 60
success rate:  0.38
epoch: 196539, loss: 1.310745358467102, rewards: -207, count: 189
success rate:  0.35
epoch: 196549, loss: 1.129062533378601, rewards: -54, count: 45
success rate:  0.37
epoch: 196559, loss: 1.273952841758728, rewards: -349, count: 322
success rate:  0.34
epoch: 196569, loss: 1.427671909332275

success rate:  0.39
epoch: 197419, loss: 0.8591768145561218, rewards: -43, count: 43
success rate:  0.39
epoch: 197429, loss: 0.47886401414871216, rewards: -56, count: 56
success rate:  0.37
epoch: 197439, loss: 1.043211579322815, rewards: -71, count: 71
success rate:  0.38
epoch: 197449, loss: 0.8529098629951477, rewards: -121, count: 112
success rate:  0.38
epoch: 197459, loss: 1.076446533203125, rewards: -57, count: 48
success rate:  0.4
epoch: 197469, loss: 0.7549585103988647, rewards: -62, count: 62
success rate:  0.39
epoch: 197479, loss: 0.9472728371620178, rewards: -280, count: 244
success rate:  0.4
epoch: 197489, loss: 0.7428974509239197, rewards: -182, count: 173
success rate:  0.41
epoch: 197499, loss: 0.8021920919418335, rewards: -78, count: 69
success rate:  0.4
epoch: 197509, loss: 0.7602431774139404, rewards: -477, count: 441
success rate:  0.4
epoch: 197519, loss: 1.4919967651367188, rewards: -153, count: 135
success rate:  0.35
epoch: 197529, loss: 0.8581448793411255,

success rate:  0.28
epoch: 198389, loss: 1.457705020904541, rewards: -318, count: 300
success rate:  0.31
epoch: 198399, loss: 1.099536418914795, rewards: -171, count: 171
success rate:  0.31
epoch: 198409, loss: 1.0310027599334717, rewards: -115, count: 115
success rate:  0.32
epoch: 198419, loss: 1.3522417545318604, rewards: -82, count: 73
success rate:  0.32
epoch: 198429, loss: 1.411118984222412, rewards: -154, count: 118
success rate:  0.33
epoch: 198439, loss: 1.0119622945785522, rewards: -401, count: 374
success rate:  0.37
epoch: 198449, loss: 0.42852839827537537, rewards: -74, count: 74
success rate:  0.39
epoch: 198459, loss: 0.8371725082397461, rewards: -350, count: 296
success rate:  0.36
epoch: 198469, loss: 0.9423803091049194, rewards: -412, count: 376
success rate:  0.35
epoch: 198479, loss: 0.9482819437980652, rewards: -71, count: 71
success rate:  0.35
epoch: 198489, loss: 0.9552966952323914, rewards: -158, count: 131
success rate:  0.32
epoch: 198499, loss: 0.14306084

success rate:  0.3
epoch: 199349, loss: 1.427699327468872, rewards: -109, count: 91
success rate:  0.35
epoch: 199359, loss: 0.9365514516830444, rewards: -52, count: 52
success rate:  0.36
epoch: 199369, loss: 1.248602032661438, rewards: -331, count: 295
success rate:  0.34
epoch: 199379, loss: 1.5159388780593872, rewards: -215, count: 197
success rate:  0.36
epoch: 199389, loss: 0.5022119283676147, rewards: -479, count: 443
success rate:  0.35
epoch: 199399, loss: 0.8627467155456543, rewards: -628, count: 538
success rate:  0.36
epoch: 199409, loss: 0.3831506669521332, rewards: -148, count: 139
success rate:  0.37
epoch: 199419, loss: 0.6820620894432068, rewards: -830, count: 758
success rate:  0.39
epoch: 199429, loss: 1.131239414215088, rewards: -195, count: 177
success rate:  0.39
epoch: 199439, loss: 0.8942668437957764, rewards: -408, count: 381
success rate:  0.38
epoch: 199449, loss: 0.9034056067466736, rewards: -60, count: 60
success rate:  0.35
epoch: 199459, loss: 1.173500418

In [34]:
def sample_action(self,state):
    probs = self.policy_net(state) # 4
    if np.random.uniform() < 0.01:
        action = np.random.randint(0,4)
        return action, torch.log(probs[action]+1e-8).detach()
    dist = Categorical(probs)
    action = dist.sample()
    log_prob = dist.log_prob(action)
    return action.item(),log_prob.detach()

# 替换方法
import types
agent.sample_action = types.MethodType(sample_action, agent)

In [16]:
import time
def visualize_agent(env, agent, num_episodes=5):
    """
    渲染显示智能体的行动
    """
    env = gym.make('CliffWalking-v0', render_mode='human')  # 创建可视化环境
    
    for episode in range(num_episodes):
        state_tuple = env.reset()
        state = state_tuple[0] if isinstance(state_tuple, tuple) else state_tuple
        total_reward = 0
        steps = 0
        done = False
        
        print(f"\nEpisode {episode + 1}")
        
        while not done:
            env.render()  # 渲染当前状态
            
            # 将状态转换为one-hot编码
            state_onehot = np.zeros(48)
            state_onehot[state] = 1
            
            # 使用训练好的策略选择动作
            with torch.no_grad():
                if np.random.random() < 0.0:
                    action = np.random.randint(0, 4)
                else:
                    state_tensor = torch.FloatTensor(state_onehot)
                    probs = agent.policy_net(state_tensor)
                    action = probs.argmax().item()  # 使用最可能的动作
            
            # 执行动作
            step_result = env.step(action)
            if len(step_result) == 4:
                next_state, reward, done, _ = step_result
            else:
                next_state, reward, terminated, truncated, _ = step_result
                done = terminated or truncated
            
            total_reward += reward
            steps += 1
            state = next_state
            
            # 添加小延迟使动作更容易观察
            time.sleep(0.5)
        
        print(f"Episode finished after {steps} steps. Total reward: {total_reward}")
    
    env.close()

# 在主程序最后添加：
if __name__ == "__main__":    
    # 训练完成后显示智能体行动
    print("\nVisualizing trained agent behavior...")
    env = gym.make('CliffWalking-v0',render_mode='human')
    visualize_agent(env, agent)


Visualizing trained agent behavior...


2025-04-27 13:52:09.865 python[67964:170339742] +[IMKClient subclass]: chose IMKClient_Modern
2025-04-27 13:52:09.865 python[67964:170339742] +[IMKInputSession subclass]: chose IMKInputSession_Modern



Episode 1
Episode finished after 17 steps. Total reward: -17

Episode 2


KeyboardInterrupt: 

In [None]:
env.close()