In [79]:
import gym
import numpy as np
import gymnasium as gym 


In [80]:
import flappy_bird_gymnasium
import gymnasium as gym

# Alternative configuration kept for reference: on-screen rendering with lidar observations.
# env = gym.make("FlappyBird-v0", render_mode="human", use_lidar=True)
# Create the environment with lidar observations disabled and no render_mode requested.
env = gym.make("FlappyBird-v0",use_lidar=False)
# reset() is unpacked into the initial observation and an info value.
obs, info = env.reset()




In [81]:
# Inspect the shape of the initial observation (the recorded output is (12,)).
obs.shape

(12,)

In [82]:
# Inspect the environment's action space (the recorded output is Discrete(2)).
env.action_space

Discrete(2)

In [83]:
# Sanity check: play one episode with uniformly random actions and print each
# (action, reward) pair. A single reset is enough; the extra bare
# `env.reset()` that used to precede it only discarded an episode.
state, _ = env.reset()
done = False
while not done:
    action = int(np.random.choice([0, 1]))
    next_state, reward, terminated, truncated, _ = env.step(action)
    # step() reports two separate end-of-episode flags. Stop on either one so
    # the environment is never stepped again after it has been truncated.
    done = terminated or truncated
    print(action, reward)

0 0.1
1 0.1
1 0.1
1 0.1
0 0.1
1 0.1
0 0.1
1 0.1
1 0.1
1 0.1
0 0.1
0 0.1
1 0.1
1 0.1
0 0.1
1 0.1
1 0.1
0 0.1
0 0.1
0 0.1
1 0.1
1 0.1
0 0.1
1 0.1
1 0.1
0 0.1
0 0.1
0 0.1
1 0.1
0 -0.5
1 -0.5
0 -0.5
0 -0.5
0 -0.5
0 -0.5
1 -0.5
1 -0.5
1 -0.5
0 -0.5
0 -0.5
1 -0.5
1 -0.5
0 -0.5
0 -0.5
0 -0.5
1 -0.5
0 -0.5
1 -0.5
1 -0.5
0 -1


In [84]:
from torch import nn
import torch

In [85]:
class PolicyNet(nn.Module):
    """Two-layer MLP policy that maps a state vector to action probabilities.

    Architecture: Linear(input_dim, hidden_dim) -> ReLU ->
    Linear(hidden_dim, output_dim) -> Softmax over the last dimension.

    Args:
        input_dim: Size of the last dimension of the state tensor.
        output_dim: Number of outputs (one probability per action).
        hidden_dim: Width of the hidden layer. Defaults to 200, the value that
            was previously hard-coded, so ``PolicyNet(input_dim, output_dim)``
            builds the same network as before.
    """

    def __init__(self, input_dim, output_dim, hidden_dim=200):
        super().__init__()
        self.linear1 = nn.Linear(input_dim, hidden_dim)
        self.relu = nn.ReLU()
        self.linear2 = nn.Linear(hidden_dim, output_dim)
        # dim=-1 normalises over the action axis, so a single state of shape
        # (input_dim,) and a batch of shape (..., input_dim) both work.
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, state):
        """Return action probabilities for ``state``.

        Args:
            state: Float tensor whose last dimension has size ``input_dim``.

        Returns:
            Tensor with the same leading dimensions as ``state`` and a last
            dimension of size ``output_dim`` whose entries sum to 1.
        """
        hidden = self.relu(self.linear1(state))
        logits = self.linear2(hidden)
        return self.softmax(logits)

In [86]:
from torch.distributions import Categorical
import numpy as np
# Define `np.bool8` on the numpy module as an alias of `np.bool_`.
# NOTE(review): presumably a compatibility shim for a dependency that still
# references `np.bool8` on a NumPy build that no longer provides it — confirm
# it is still required and drop it otherwise.
np.bool8 = np.bool_

from torch.optim import AdamW

In [87]:
class Agent:
    """REINFORCE (Monte-Carlo policy-gradient) agent with a softmax policy.

    Args:
        input_dim: Length of the observation vector fed to the policy network.
        output_dim: Number of discrete actions.
        lr: Learning rate passed to the AdamW optimizer.
        gamma: Discount factor used when accumulating returns.

    The defaults (12, 2, 1e-3, 0.99) are the values that used to be
    hard-coded, so ``Agent()`` behaves as before.
    """

    def __init__(self, input_dim=12, output_dim=2, lr=1e-3, gamma=0.99):
        self.gamma = gamma
        self.policy_net = PolicyNet(input_dim, output_dim)
        self.optimizer = AdamW(self.policy_net.parameters(), lr=lr)

    def sample_action(self, state):
        """Sample an action from the current policy.

        Args:
            state: Float tensor accepted by ``self.policy_net``.

        Returns:
            ``(action, log_prob)`` where ``action`` is a Python int and
            ``log_prob`` is the log-probability tensor of that action, still
            attached to the autograd graph so ``update`` can backpropagate
            through it.
        """
        probs = self.policy_net(state)
        dist = Categorical(probs)
        action = dist.sample()
        return action.item(), dist.log_prob(action)

    @staticmethod
    def _discounted_returns(rewards, gamma):
        """Return the discounted return G_t for every step of one episode.

        G_t = r_t + gamma * G_{t+1}, computed in a single backward pass.
        Appending and reversing once avoids the quadratic cost of repeatedly
        inserting at the front of a list.
        """
        returns = []
        running = 0.0
        for reward in reversed(rewards):
            running = running * gamma + reward
            returns.append(running)
        returns.reverse()
        return returns

    def update(self, rewards, log_probs):
        """Run one policy-gradient step from a single finished episode.

        Args:
            rewards: Sequence of per-step rewards for the episode.
            log_probs: Sequence of log-probability tensors returned by
                ``sample_action``, one per step, in the same order.

        Returns:
            The scalar loss as a tensor detached from the autograd graph.
            For an empty episode no optimizer step is taken and a zero
            tensor is returned.

        Raises:
            ValueError: If ``rewards`` and ``log_probs`` differ in length.
        """
        if len(rewards) != len(log_probs):
            raise ValueError(
                f"rewards and log_probs must have the same length, "
                f"got {len(rewards)} and {len(log_probs)}"
            )
        if len(log_probs) == 0:
            # Nothing to learn from; avoid stacking an empty list.
            return torch.tensor(0.0)

        returns = torch.tensor(
            self._discounted_returns(rewards, self.gamma), dtype=torch.float32
        )
        # Standardise returns to reduce gradient variance. The sample standard
        # deviation of a single value is NaN, which would poison every weight
        # on the optimizer step, so one-step episodes keep their raw return.
        if returns.numel() > 1:
            returns = returns - returns.mean()
            returns = returns / (returns.std() + 1e-8)

        # reshape(-1) flattens the stacked log-probs so they line up with the
        # 1-D returns whether each entry is a scalar or a one-element tensor.
        stacked_log_probs = torch.stack(list(log_probs)).reshape(-1)
        loss = -(returns * stacked_log_probs).sum()

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # Detach so callers that keep the value do not hold on to the graph.
        return loss.detach()

In [88]:
def convert2tensor(state):
    """Convert an environment observation into a float32 tensor.

    Uses ``torch.as_tensor`` with an explicit dtype instead of the legacy
    ``torch.FloatTensor`` constructor, which treats an integer argument as a
    *size* and returns uninitialised memory.

    Args:
        state: Array-like observation (NumPy array, list, scalar or tensor).

    Returns:
        A ``torch.float32`` tensor holding the values of ``state``.
    """
    return torch.as_tensor(state, dtype=torch.float32)

In [89]:
def train(agent, env, num_episodes=20000000, max_steps=1000, log_every=10):
    """Train ``agent`` on ``env``, one policy update per episode.

    Each episode is rolled out with ``agent.sample_action`` until the
    environment reports it has ended or ``max_steps`` steps were taken, then
    ``agent.update`` is called once with the collected rewards and
    log-probabilities.

    Args:
        agent: Object exposing ``sample_action(state_tensor)`` returning
            ``(action, log_prob)`` and ``update(rewards, log_probs)``
            returning a loss.
        env: Environment whose ``reset()`` returns ``(state, info)`` and whose
            ``step(action)`` returns
            ``(next_state, reward, terminated, truncated, info)``.
        num_episodes: Number of episodes to run. The default keeps the
            previously hard-coded 20000000.
        max_steps: Maximum steps collected per episode (previously the
            hard-coded 1000).
        log_every: Print a progress line every this many episodes
            (previously the hard-coded 10). A falsy value disables logging.

    Returns:
        None. Progress is reported via ``print``.
    """
    for epoch in range(num_episodes):
        rewards = []
        log_probs = []
        state, _ = env.reset()
        done = False
        while not done and len(log_probs) < max_steps:
            action, log_prob = agent.sample_action(convert2tensor(state))
            state, reward, terminated, truncated, _ = env.step(action)
            rewards.append(reward)
            log_probs.append(log_prob)
            # Stop on either end-of-episode flag; checking only `terminated`
            # would keep stepping an environment that has been truncated.
            done = terminated or truncated

        loss = agent.update(rewards, log_probs)

        if log_every and (epoch + 1) % log_every == 0:
            print(f'epoch: {epoch}, loss: {loss}, rewards: {sum(rewards)}, count: {len(rewards)}')

In [90]:
# Build the agent in its own cell: the training cell below reuses this same
# `agent` object, so re-running only that cell continues training it.
agent = Agent()



In [101]:
# Re-create the environment with lidar observations disabled, the same
# configuration whose observation shape was shown as (12,) earlier.
env = gym.make("FlappyBird-v0",use_lidar=False)
# NOTE: with its defaults train() iterates over range(20000000) episodes and
# prints every 10th one, so in practice this cell runs until interrupted.
train(agent,env)

epoch: 9, loss: 0.2699802815914154, rewards: 8.399999999999986, count: 86
epoch: 19, loss: 2.7737550735473633, rewards: 13.499999999999972, count: 128
epoch: 29, loss: 0.7120810151100159, rewards: 23.20000000000007, count: 207
epoch: 39, loss: -1.4867277145385742, rewards: 18.200000000000014, count: 166
epoch: 49, loss: -0.3117551803588867, rewards: 22.700000000000063, count: 202
epoch: 59, loss: -1.4446250200271606, rewards: 8.399999999999986, count: 86
epoch: 69, loss: -1.7559071779251099, rewards: 17.90000000000001, count: 163
epoch: 79, loss: 2.75830078125, rewards: 22.40000000000006, count: 199
epoch: 89, loss: 0.06955179572105408, rewards: 17.90000000000001, count: 163
epoch: 99, loss: -1.6416422128677368, rewards: 27.200000000000113, count: 238
epoch: 109, loss: -2.8319194316864014, rewards: 22.900000000000066, count: 204
epoch: 119, loss: -0.7819856405258179, rewards: 3.8999999999999986, count: 50
epoch: 129, loss: 0.34236666560173035, rewards: 17.90000000000001, count: 163
epo

epoch: 1079, loss: -1.2368907928466797, rewards: 13.499999999999972, count: 128
epoch: 1089, loss: -0.5841783285140991, rewards: 9.399999999999983, count: 96
epoch: 1099, loss: -0.9616409540176392, rewards: 18.300000000000015, count: 167
epoch: 1109, loss: -8.192109107971191, rewards: 25.00000000000008, count: 216
epoch: 1119, loss: -1.6525373458862305, rewards: 18.00000000000001, count: 164
epoch: 1129, loss: 4.068902969360352, rewards: 12.899999999999974, count: 122
epoch: 1139, loss: 2.729628801345825, rewards: 34.70000000000019, count: 295
epoch: 1149, loss: -1.9424107074737549, rewards: 8.699999999999985, count: 89
epoch: 1159, loss: -2.639286518096924, rewards: 8.399999999999986, count: 86
epoch: 1169, loss: -0.5340666174888611, rewards: 13.099999999999973, count: 124
epoch: 1179, loss: -4.05173397064209, rewards: 46.10000000000032, count: 391
epoch: 1189, loss: 0.5795187950134277, rewards: 22.50000000000006, count: 200
epoch: 1199, loss: -1.9965726137161255, rewards: 22.40000000

epoch: 2129, loss: -1.0469598770141602, rewards: 13.199999999999973, count: 125
epoch: 2139, loss: -0.6031270027160645, rewards: 8.399999999999986, count: 86
epoch: 2149, loss: -5.799953460693359, rewards: 8.999999999999984, count: 92
epoch: 2159, loss: -10.1764497756958, rewards: 4.1999999999999975, count: 53
epoch: 2169, loss: -0.3321964144706726, rewards: 9.399999999999983, count: 96
epoch: 2179, loss: -1.1131876707077026, rewards: 18.400000000000016, count: 168
epoch: 2189, loss: 1.068885087966919, rewards: 18.400000000000016, count: 168
epoch: 2199, loss: 0.1368669718503952, rewards: 8.399999999999986, count: 86
epoch: 2209, loss: -1.389510154724121, rewards: 8.399999999999986, count: 86
epoch: 2219, loss: -1.0948586463928223, rewards: 8.699999999999985, count: 89
epoch: 2229, loss: -1.059458613395691, rewards: 8.599999999999985, count: 88
epoch: 2239, loss: 5.390729904174805, rewards: 13.79999999999997, count: 131
epoch: 2249, loss: -1.4755992889404297, rewards: 8.399999999999986

epoch: 3179, loss: -8.46863079071045, rewards: 32.30000000000017, count: 280
epoch: 3189, loss: -0.0033131837844848633, rewards: 12.999999999999973, count: 123
epoch: 3199, loss: 0.9787770509719849, rewards: 13.099999999999973, count: 124
epoch: 3209, loss: -1.3425593376159668, rewards: 74.99999999999993, count: 626
epoch: 3219, loss: -4.670026779174805, rewards: 40.90000000000026, count: 348
epoch: 3229, loss: -4.580491542816162, rewards: 41.30000000000027, count: 352
epoch: 3239, loss: 0.07786056399345398, rewards: 8.399999999999986, count: 86
epoch: 3249, loss: -0.4964662492275238, rewards: 20.500000000000032, count: 180
epoch: 3259, loss: 0.0056766243651509285, rewards: 6.5999999999999925, count: 68
epoch: 3269, loss: 1.207205891609192, rewards: 6.799999999999992, count: 70
epoch: 3279, loss: 0.7054789066314697, rewards: 8.899999999999984, count: 91
epoch: 3289, loss: 2.2515177726745605, rewards: 22.40000000000006, count: 199
epoch: 3299, loss: 0.265078067779541, rewards: 8.3999999

epoch: 4229, loss: -1.1080031394958496, rewards: 9.099999999999984, count: 93
epoch: 4239, loss: 0.44256412982940674, rewards: 8.999999999999984, count: 92
epoch: 4249, loss: 5.0772199630737305, rewards: 8.399999999999986, count: 86
epoch: 4259, loss: -0.4431978166103363, rewards: 3.8999999999999986, count: 50
epoch: 4269, loss: 0.843498706817627, rewards: 12.899999999999974, count: 122
epoch: 4279, loss: -2.1530447006225586, rewards: 12.899999999999974, count: 122
epoch: 4289, loss: 0.6444000005722046, rewards: 6.5999999999999925, count: 68
epoch: 4299, loss: -2.5080888271331787, rewards: 13.599999999999971, count: 129
epoch: 4309, loss: 0.41641509532928467, rewards: 8.399999999999986, count: 86
epoch: 4319, loss: -2.1738357543945312, rewards: 22.600000000000062, count: 201
epoch: 4329, loss: 0.5049083232879639, rewards: 12.899999999999974, count: 122
epoch: 4339, loss: -1.2299829721450806, rewards: 18.500000000000018, count: 169
epoch: 4349, loss: 2.7018136978149414, rewards: 8.39999

epoch: 5279, loss: 2.506747245788574, rewards: 6.699999999999992, count: 69
epoch: 5289, loss: 0.2804063558578491, rewards: 27.900000000000123, count: 245
epoch: 5299, loss: 0.8202221393585205, rewards: 8.399999999999986, count: 86
epoch: 5309, loss: -3.8219900131225586, rewards: 8.499999999999986, count: 87
epoch: 5319, loss: -0.7130427956581116, rewards: 13.499999999999972, count: 128
epoch: 5329, loss: 1.0851186513900757, rewards: 36.40000000000021, count: 312
epoch: 5339, loss: -3.379080057144165, rewards: 65.20000000000037, count: 546
epoch: 5349, loss: -2.8585829734802246, rewards: 8.599999999999985, count: 88
epoch: 5359, loss: 2.682302713394165, rewards: 26.90000000000011, count: 235
epoch: 5369, loss: 0.213509202003479, rewards: 8.399999999999986, count: 86
epoch: 5379, loss: -1.8385975360870361, rewards: 8.399999999999986, count: 86
epoch: 5389, loss: 1.0536761283874512, rewards: 8.399999999999986, count: 86
epoch: 5399, loss: -0.5269852876663208, rewards: 8.399999999999986, 

epoch: 6329, loss: 1.6827921867370605, rewards: 22.40000000000006, count: 199
epoch: 6339, loss: 0.4293331205844879, rewards: 8.399999999999986, count: 86
epoch: 6349, loss: 3.062600612640381, rewards: 13.099999999999973, count: 124
epoch: 6359, loss: -1.167496681213379, rewards: 8.399999999999986, count: 86
epoch: 6369, loss: -0.47606539726257324, rewards: 8.399999999999986, count: 86
epoch: 6379, loss: 0.4057508707046509, rewards: 22.50000000000006, count: 200
epoch: 6389, loss: -0.6402198076248169, rewards: 15.79999999999998, count: 142
epoch: 6399, loss: -3.1746420860290527, rewards: 14.99999999999997, count: 134
epoch: 6409, loss: 0.7001944184303284, rewards: 40.90000000000026, count: 348
epoch: 6419, loss: -6.337176322937012, rewards: 27.70000000000012, count: 243
epoch: 6429, loss: 2.858246088027954, rewards: 22.40000000000006, count: 199
epoch: 6439, loss: 2.1084532737731934, rewards: 6.5999999999999925, count: 68
epoch: 6449, loss: 0.021487116813659668, rewards: 8.399999999999

epoch: 7379, loss: -2.163482666015625, rewards: 18.400000000000016, count: 168
epoch: 7389, loss: -1.0122963190078735, rewards: 23.30000000000007, count: 208
epoch: 7399, loss: -1.4565404653549194, rewards: 8.399999999999986, count: 86
epoch: 7409, loss: -1.5134950876235962, rewards: 8.399999999999986, count: 86
epoch: 7419, loss: -7.241347312927246, rewards: 32.50000000000017, count: 282
epoch: 7429, loss: 0.1400040090084076, rewards: 13.099999999999973, count: 124
epoch: 7439, loss: -2.125335693359375, rewards: 8.399999999999986, count: 86
epoch: 7449, loss: -11.675012588500977, rewards: 6.799999999999992, count: 70
epoch: 7459, loss: -0.3426772952079773, rewards: 15.79999999999998, count: 142
epoch: 7469, loss: 0.34289973974227905, rewards: 8.399999999999986, count: 86
epoch: 7479, loss: -3.4032678604125977, rewards: 17.90000000000001, count: 163
epoch: 7489, loss: -0.31650036573410034, rewards: 22.40000000000006, count: 199
epoch: 7499, loss: -0.8035979270935059, rewards: 12.899999

epoch: 8429, loss: -0.958789587020874, rewards: 9.099999999999984, count: 93
epoch: 8439, loss: -1.2898445129394531, rewards: 34.500000000000185, count: 293
epoch: 8449, loss: -1.3031383752822876, rewards: 12.899999999999974, count: 122
epoch: 8459, loss: -1.0528223514556885, rewards: 8.399999999999986, count: 86
epoch: 8469, loss: -4.756318092346191, rewards: 22.40000000000006, count: 199
epoch: 8479, loss: -5.152090072631836, rewards: 20.500000000000032, count: 180
epoch: 8489, loss: 4.301951885223389, rewards: 42.00000000000028, count: 359
epoch: 8499, loss: -2.3385095596313477, rewards: 54.90000000000042, count: 461
epoch: 8509, loss: -5.555517196655273, rewards: 17.90000000000001, count: 163
epoch: 8519, loss: 0.5003859400749207, rewards: 8.399999999999986, count: 86
epoch: 8529, loss: 1.6694811582565308, rewards: 8.399999999999986, count: 86
epoch: 8539, loss: -6.204046249389648, rewards: 48.800000000000345, count: 409
epoch: 8549, loss: 0.8586528897285461, rewards: 8.59999999999

epoch: 9479, loss: 1.236729383468628, rewards: 8.399999999999986, count: 86
epoch: 9489, loss: -1.833242654800415, rewards: 12.999999999999973, count: 123
epoch: 9499, loss: -4.677593231201172, rewards: 67.30000000000031, count: 558
epoch: 9509, loss: -1.8843095302581787, rewards: 8.399999999999986, count: 86
epoch: 9519, loss: -8.494834899902344, rewards: 20.800000000000036, count: 183
epoch: 9529, loss: 3.92692232131958, rewards: 22.40000000000006, count: 199
epoch: 9539, loss: -6.692834854125977, rewards: 11.29999999999998, count: 106
epoch: 9549, loss: -1.0264177322387695, rewards: 17.90000000000001, count: 163
epoch: 9559, loss: 0.15128964185714722, rewards: 13.099999999999973, count: 124
epoch: 9569, loss: -3.3563482761383057, rewards: 23.000000000000068, count: 205
epoch: 9579, loss: -3.2114953994750977, rewards: 22.40000000000006, count: 199
epoch: 9589, loss: -3.1350812911987305, rewards: 22.600000000000062, count: 201
epoch: 9599, loss: -1.7796467542648315, rewards: 11.199999

epoch: 10529, loss: -0.8814147710800171, rewards: 9.299999999999983, count: 95
epoch: 10539, loss: -3.653989791870117, rewards: 36.40000000000021, count: 312
epoch: 10549, loss: 2.086097002029419, rewards: 13.89999999999997, count: 132
epoch: 10559, loss: 0.13301301002502441, rewards: 8.399999999999986, count: 86
epoch: 10569, loss: 0.2601528465747833, rewards: 4.1999999999999975, count: 53
epoch: 10579, loss: -0.4661083519458771, rewards: 12.899999999999974, count: 122
epoch: 10589, loss: -1.0396699905395508, rewards: 8.499999999999986, count: 87
epoch: 10599, loss: -2.228294849395752, rewards: 13.199999999999973, count: 125
epoch: 10609, loss: -0.41012951731681824, rewards: 36.40000000000021, count: 312
epoch: 10619, loss: -3.6253156661987305, rewards: 27.60000000000012, count: 242
epoch: 10629, loss: 0.08340892940759659, rewards: 8.599999999999985, count: 88
epoch: 10639, loss: 0.29373499751091003, rewards: 22.50000000000006, count: 200
epoch: 10649, loss: 1.3839982748031616, reward

epoch: 11569, loss: 2.5654408931732178, rewards: 8.499999999999986, count: 87
epoch: 11579, loss: -0.6749405860900879, rewards: 8.399999999999986, count: 86
epoch: 11589, loss: 0.860650897026062, rewards: 64.40000000000042, count: 538
epoch: 11599, loss: 2.094043493270874, rewards: 26.90000000000011, count: 235
epoch: 11609, loss: 4.854959011077881, rewards: 9.099999999999984, count: 93
epoch: 11619, loss: 0.2699664235115051, rewards: 12.999999999999973, count: 123
epoch: 11629, loss: -1.394642949104309, rewards: 20.700000000000035, count: 182
epoch: 11639, loss: -1.5392720699310303, rewards: 17.90000000000001, count: 163
epoch: 11649, loss: 0.2844504415988922, rewards: 8.399999999999986, count: 86
epoch: 11659, loss: 3.0126943588256836, rewards: 12.899999999999974, count: 122
epoch: 11669, loss: -0.37426888942718506, rewards: 6.699999999999992, count: 69
epoch: 11679, loss: -0.33780109882354736, rewards: 32.900000000000176, count: 286
epoch: 11689, loss: 1.2813835144042969, rewards: 2

epoch: 12609, loss: -5.600581645965576, rewards: 11.29999999999998, count: 106
epoch: 12619, loss: 0.709956705570221, rewards: 8.399999999999986, count: 86
epoch: 12629, loss: -0.1751483678817749, rewards: 22.40000000000006, count: 199
epoch: 12639, loss: 0.3475102186203003, rewards: 12.899999999999974, count: 122
epoch: 12649, loss: 1.6817702054977417, rewards: 12.899999999999974, count: 122
epoch: 12659, loss: 5.323209285736084, rewards: 37.00000000000022, count: 318
epoch: 12669, loss: 0.8716027140617371, rewards: 22.40000000000006, count: 199
epoch: 12679, loss: -0.5145681500434875, rewards: 8.399999999999986, count: 86
epoch: 12689, loss: 7.070563316345215, rewards: 12.899999999999974, count: 122
epoch: 12699, loss: 1.0719438791275024, rewards: 6.799999999999992, count: 70
epoch: 12709, loss: -0.08533214777708054, rewards: 6.499999999999993, count: 67
epoch: 12719, loss: 1.0432268381118774, rewards: 6.799999999999992, count: 70
epoch: 12729, loss: 2.328873872756958, rewards: 8.399

epoch: 13649, loss: -0.05722832679748535, rewards: 8.399999999999986, count: 86
epoch: 13659, loss: 1.7032276391983032, rewards: 40.90000000000026, count: 348
epoch: 13669, loss: 0.9552003741264343, rewards: 8.399999999999986, count: 86
epoch: 13679, loss: -1.0334490537643433, rewards: 8.399999999999986, count: 86
epoch: 13689, loss: -0.28945761919021606, rewards: 8.699999999999985, count: 89
epoch: 13699, loss: -4.259769439697266, rewards: 3.8999999999999986, count: 50
epoch: 13709, loss: 0.037727952003479004, rewards: 4.1999999999999975, count: 53
epoch: 13719, loss: -0.7770285606384277, rewards: 18.200000000000014, count: 166
epoch: 13729, loss: 0.28910335898399353, rewards: 8.399999999999986, count: 86
epoch: 13739, loss: -3.6308743953704834, rewards: 8.499999999999986, count: 87
epoch: 13749, loss: 1.752488374710083, rewards: 8.399999999999986, count: 86
epoch: 13759, loss: -2.7174086570739746, rewards: 27.00000000000011, count: 236
epoch: 13769, loss: -0.6509973406791687, rewards

epoch: 14689, loss: 1.9822423458099365, rewards: 20.800000000000036, count: 183
epoch: 14699, loss: -5.509544849395752, rewards: 60.700000000000486, count: 510
epoch: 14709, loss: -0.9395191669464111, rewards: 22.40000000000006, count: 199
epoch: 14719, loss: 1.561128854751587, rewards: 12.899999999999974, count: 122
epoch: 14729, loss: -0.2888183891773224, rewards: 8.499999999999986, count: 87
epoch: 14739, loss: -0.04011034965515137, rewards: 11.19999999999998, count: 105
epoch: 14749, loss: 1.949681043624878, rewards: 8.399999999999986, count: 86
epoch: 14759, loss: -1.923681616783142, rewards: 8.599999999999985, count: 88
epoch: 14769, loss: 0.7074611783027649, rewards: 8.599999999999985, count: 88
epoch: 14779, loss: 1.2082173824310303, rewards: 22.40000000000006, count: 199
epoch: 14789, loss: 3.223182439804077, rewards: 26.90000000000011, count: 235
epoch: 14799, loss: 0.7063115835189819, rewards: 8.399999999999986, count: 86
epoch: 14809, loss: 0.23802529275417328, rewards: 13.

epoch: 15739, loss: 0.5891904830932617, rewards: 6.299999999999994, count: 65
epoch: 15749, loss: 1.1038155555725098, rewards: 4.799999999999995, count: 59
epoch: 15759, loss: -0.033953145146369934, rewards: 8.399999999999986, count: 86
epoch: 15769, loss: 2.3410191535949707, rewards: 40.90000000000026, count: 348
epoch: 15779, loss: 0.6806114315986633, rewards: 8.399999999999986, count: 86
epoch: 15789, loss: 2.165644645690918, rewards: 12.899999999999974, count: 122
epoch: 15799, loss: 0.6758676171302795, rewards: 36.40000000000021, count: 312
epoch: 15809, loss: 1.4682742357254028, rewards: 8.399999999999986, count: 86
epoch: 15819, loss: 0.028800301253795624, rewards: 13.79999999999997, count: 131
epoch: 15829, loss: -4.846429347991943, rewards: 41.40000000000027, count: 353
epoch: 15839, loss: 2.1860604286193848, rewards: 36.40000000000021, count: 312
epoch: 15849, loss: 1.586754322052002, rewards: 122.49999999999785, count: 1000
epoch: 15859, loss: -4.510165214538574, rewards: 26

epoch: 16779, loss: 3.9880430698394775, rewards: 92.39999999999917, count: 764
epoch: 16789, loss: 0.8480566143989563, rewards: 11.09999999999998, count: 104
epoch: 16799, loss: 0.7530653476715088, rewards: 34.70000000000019, count: 295
epoch: 16809, loss: -0.5239214897155762, rewards: 13.099999999999973, count: 124
epoch: 16819, loss: 1.2531042098999023, rewards: 26.90000000000011, count: 235
epoch: 16829, loss: 3.0016722679138184, rewards: 37.50000000000023, count: 323
epoch: 16839, loss: 1.1409378051757812, rewards: 22.900000000000066, count: 204
epoch: 16849, loss: 0.12254592776298523, rewards: 22.40000000000006, count: 199
epoch: 16859, loss: -1.3975298404693604, rewards: 50.40000000000037, count: 425
epoch: 16869, loss: 4.265402317047119, rewards: 64.40000000000042, count: 538
epoch: 16879, loss: -1.5909831523895264, rewards: 107.29999999999849, count: 886
epoch: 16889, loss: 3.0139737129211426, rewards: 22.40000000000006, count: 199
epoch: 16899, loss: -4.577594757080078, reward

epoch: 17819, loss: -0.4504358172416687, rewards: 25.200000000000085, count: 218
epoch: 17829, loss: 1.8263657093048096, rewards: 22.40000000000006, count: 199
epoch: 17839, loss: -0.11448037624359131, rewards: 36.40000000000021, count: 312
epoch: 17849, loss: 0.07292751967906952, rewards: 8.399999999999986, count: 86
epoch: 17859, loss: 1.209091067314148, rewards: 36.40000000000021, count: 312
epoch: 17869, loss: -7.826637268066406, rewards: 3.8999999999999986, count: 50
epoch: 17879, loss: 1.9631774425506592, rewards: 12.899999999999974, count: 122
epoch: 17889, loss: 3.8545777797698975, rewards: 40.90000000000026, count: 348
epoch: 17899, loss: -4.581961631774902, rewards: 51.20000000000038, count: 433
epoch: 17909, loss: 1.4174748659133911, rewards: 12.899999999999974, count: 122
epoch: 17919, loss: -8.172852516174316, rewards: 23.500000000000075, count: 210
epoch: 17929, loss: 0.8215234875679016, rewards: 12.899999999999974, count: 122
epoch: 17939, loss: -0.3676386773586273, rewa

epoch: 18859, loss: 1.4255728721618652, rewards: 22.40000000000006, count: 199
epoch: 18869, loss: -0.7913568615913391, rewards: 9.099999999999984, count: 93
epoch: 18879, loss: 0.6936826109886169, rewards: 22.40000000000006, count: 199
epoch: 18889, loss: -5.023604393005371, rewards: 36.50000000000021, count: 313
epoch: 18899, loss: -3.058396339416504, rewards: 48.800000000000345, count: 409
epoch: 18909, loss: -0.9677417874336243, rewards: 9.399999999999983, count: 96
epoch: 18919, loss: -1.634666085243225, rewards: 12.899999999999974, count: 122
epoch: 18929, loss: 0.34759023785591125, rewards: 12.899999999999974, count: 122
epoch: 18939, loss: 3.926891326904297, rewards: 8.399999999999986, count: 86
epoch: 18949, loss: 1.3577767610549927, rewards: 69.00000000000021, count: 575
epoch: 18959, loss: -3.0367648601531982, rewards: 36.40000000000021, count: 312
epoch: 18969, loss: -0.18025386333465576, rewards: 8.399999999999986, count: 86
epoch: 18979, loss: 0.686591386795044, rewards: 

epoch: 19899, loss: -0.42814579606056213, rewards: 4.599999999999996, count: 57
epoch: 19909, loss: -1.4167088270187378, rewards: 36.90000000000022, count: 317
epoch: 19919, loss: -2.6570968627929688, rewards: 36.40000000000021, count: 312
epoch: 19929, loss: -4.597011566162109, rewards: 22.40000000000006, count: 199
epoch: 19939, loss: -3.2174036502838135, rewards: 8.399999999999986, count: 86
epoch: 19949, loss: -1.9170317649841309, rewards: 8.899999999999984, count: 91
epoch: 19959, loss: 0.048769235610961914, rewards: 6.199999999999994, count: 64
epoch: 19969, loss: 0.3532014787197113, rewards: 3.8999999999999986, count: 50
epoch: 19979, loss: 0.22005875408649445, rewards: 4.399999999999997, count: 55
epoch: 19989, loss: -0.5053802132606506, rewards: 4.399999999999997, count: 55
epoch: 19999, loss: -0.5699442028999329, rewards: 4.1999999999999975, count: 53
epoch: 20009, loss: 0.22173286974430084, rewards: 8.399999999999986, count: 86
epoch: 20019, loss: -5.3153815269470215, reward

epoch: 20939, loss: 1.8618543148040771, rewards: 36.40000000000021, count: 312
epoch: 20949, loss: 0.5180067420005798, rewards: 13.099999999999973, count: 124
epoch: 20959, loss: -1.7018556594848633, rewards: 26.90000000000011, count: 235
epoch: 20969, loss: -2.787135124206543, rewards: 8.399999999999986, count: 86
epoch: 20979, loss: -7.984166622161865, rewards: 18.00000000000001, count: 164
epoch: 20989, loss: 3.755082845687866, rewards: 22.40000000000006, count: 199
epoch: 20999, loss: -0.264927476644516, rewards: 4.699999999999996, count: 58
epoch: 21009, loss: -12.369701385498047, rewards: 22.40000000000006, count: 199
epoch: 21019, loss: -0.2943025827407837, rewards: 8.799999999999985, count: 90
epoch: 21029, loss: -0.037929993122816086, rewards: 8.399999999999986, count: 86
epoch: 21039, loss: -3.0766983032226562, rewards: 27.10000000000011, count: 237
epoch: 21049, loss: 0.9822655916213989, rewards: 12.899999999999974, count: 122
epoch: 21059, loss: 0.8454345464706421, rewards:

epoch: 21979, loss: -0.4406980276107788, rewards: 4.1999999999999975, count: 53
epoch: 21989, loss: 1.7088067531585693, rewards: 3.8999999999999986, count: 50
epoch: 21999, loss: -1.565285325050354, rewards: -2.099999999999998, count: 50
epoch: 22009, loss: -1.1955865621566772, rewards: 3.8999999999999986, count: 50
epoch: 22019, loss: 0.12847274541854858, rewards: 3.8999999999999986, count: 50
epoch: 22029, loss: 0.13270233571529388, rewards: 3.8999999999999986, count: 50
epoch: 22039, loss: -1.1441617012023926, rewards: 3.8999999999999986, count: 50
epoch: 22049, loss: 0.07630098611116409, rewards: 3.8999999999999986, count: 50
epoch: 22059, loss: 0.1516071856021881, rewards: 3.8999999999999986, count: 50
epoch: 22069, loss: -2.986497402191162, rewards: 3.8999999999999986, count: 50
epoch: 22079, loss: -0.7023754119873047, rewards: 3.8999999999999986, count: 50
epoch: 22089, loss: -3.029719829559326, rewards: 4.299999999999997, count: 54
epoch: 22099, loss: -1.640188217163086, reward

epoch: 23019, loss: -0.3471868634223938, rewards: 22.700000000000063, count: 202
epoch: 23029, loss: 6.299147129058838, rewards: 8.399999999999986, count: 86
epoch: 23039, loss: 1.2901718616485596, rewards: 3.8999999999999986, count: 50
epoch: 23049, loss: 3.8844058513641357, rewards: 3.8999999999999986, count: 50
epoch: 23059, loss: 3.000485420227051, rewards: 26.90000000000011, count: 235
epoch: 23069, loss: 0.03983020782470703, rewards: 12.899999999999974, count: 122
epoch: 23079, loss: 3.2699384689331055, rewards: 8.399999999999986, count: 86
epoch: 23089, loss: 2.7578625679016113, rewards: 36.40000000000021, count: 312
epoch: 23099, loss: -2.062021493911743, rewards: 6.5999999999999925, count: 68
epoch: 23109, loss: 1.8262791633605957, rewards: 8.399999999999986, count: 86
epoch: 23119, loss: 2.056837797164917, rewards: 8.399999999999986, count: 86
epoch: 23129, loss: 0.9237529039382935, rewards: 8.399999999999986, count: 86
epoch: 23139, loss: 0.6217354536056519, rewards: 54.9000

epoch: 24059, loss: -0.27900370955467224, rewards: 4.1999999999999975, count: 53
epoch: 24069, loss: 0.9693254232406616, rewards: 8.399999999999986, count: 86
epoch: 24079, loss: 6.149330139160156, rewards: 96.89999999999897, count: 800
epoch: 24089, loss: -2.875717878341675, rewards: 12.899999999999974, count: 122
epoch: 24099, loss: 3.1935176849365234, rewards: 8.399999999999986, count: 86
epoch: 24109, loss: -0.7015696167945862, rewards: 9.199999999999983, count: 94
epoch: 24119, loss: 1.719582200050354, rewards: 3.8999999999999986, count: 50
epoch: 24129, loss: 0.17318344116210938, rewards: 8.399999999999986, count: 86
epoch: 24139, loss: -10.246864318847656, rewards: 22.50000000000006, count: 200
epoch: 24149, loss: 0.39331769943237305, rewards: 3.8999999999999986, count: 50
epoch: 24159, loss: 0.15805977582931519, rewards: 3.8999999999999986, count: 50
epoch: 24169, loss: -1.2519159317016602, rewards: 4.299999999999997, count: 54
epoch: 24179, loss: 1.5438165664672852, rewards: 1

epoch: 25109, loss: 2.2820539474487305, rewards: 3.8999999999999986, count: 50
epoch: 25119, loss: 4.999239444732666, rewards: 4.099999999999998, count: 52
epoch: 25129, loss: 2.592495918273926, rewards: 3.8999999999999986, count: 50
epoch: 25139, loss: 0.44300833344459534, rewards: 3.8999999999999986, count: 50
epoch: 25149, loss: 1.793505311012268, rewards: 3.8999999999999986, count: 50
epoch: 25159, loss: 2.2718496322631836, rewards: 4.099999999999998, count: 52
epoch: 25169, loss: 1.4327555894851685, rewards: 3.8999999999999986, count: 50
epoch: 25179, loss: 5.054328918457031, rewards: 3.9999999999999982, count: 51
epoch: 25189, loss: 4.115429878234863, rewards: 3.8999999999999986, count: 50
epoch: 25199, loss: -0.6713250279426575, rewards: 3.8999999999999986, count: 50
epoch: 25209, loss: 0.5675594806671143, rewards: 3.9999999999999982, count: 51
epoch: 25219, loss: 0.24592819809913635, rewards: 4.099999999999998, count: 52
epoch: 25229, loss: -0.5115448832511902, rewards: 4.19999

epoch: 26159, loss: 2.2943787574768066, rewards: 13.099999999999973, count: 124
epoch: 26169, loss: 2.431253433227539, rewards: 8.799999999999985, count: 90
epoch: 26179, loss: -0.21336841583251953, rewards: 27.200000000000113, count: 238
epoch: 26189, loss: 0.40945547819137573, rewards: 12.899999999999974, count: 122
epoch: 26199, loss: -3.291358232498169, rewards: 4.999999999999995, count: 61
epoch: 26209, loss: 0.5855323076248169, rewards: 3.8999999999999986, count: 50
epoch: 26219, loss: 0.16931721568107605, rewards: 12.899999999999974, count: 122
epoch: 26229, loss: -2.2068278789520264, rewards: 18.400000000000016, count: 168
epoch: 26239, loss: 0.4138168692588806, rewards: 22.40000000000006, count: 199
epoch: 26249, loss: 3.860595226287842, rewards: 40.90000000000026, count: 348
epoch: 26259, loss: 0.1448466032743454, rewards: 4.399999999999997, count: 55
epoch: 26269, loss: 1.1919018030166626, rewards: 4.599999999999996, count: 57
epoch: 26279, loss: -3.0475833415985107, rewards

epoch: 27199, loss: 0.7829976081848145, rewards: 36.40000000000021, count: 312
epoch: 27209, loss: 8.709611892700195, rewards: 122.49999999999785, count: 1000
epoch: 27219, loss: 1.2086904048919678, rewards: 8.399999999999986, count: 86
epoch: 27229, loss: -0.2248317003250122, rewards: 4.299999999999997, count: 54
epoch: 27239, loss: 0.8496409058570862, rewards: 8.399999999999986, count: 86
epoch: 27249, loss: 4.790909767150879, rewards: 40.90000000000026, count: 348
epoch: 27259, loss: -1.470710277557373, rewards: 8.399999999999986, count: 86
epoch: 27269, loss: -0.9279530644416809, rewards: 4.599999999999996, count: 57
epoch: 27279, loss: 1.3605314493179321, rewards: 8.399999999999986, count: 86
epoch: 27289, loss: -4.736769676208496, rewards: 26.90000000000011, count: 235
epoch: 27299, loss: 1.03859543800354, rewards: 26.90000000000011, count: 235
epoch: 27309, loss: -2.8506033420562744, rewards: 27.400000000000116, count: 240
epoch: 27319, loss: 2.426842451095581, rewards: 12.89999

epoch: 28239, loss: 0.05792151018977165, rewards: 3.8999999999999986, count: 50
epoch: 28249, loss: -4.00729513168335, rewards: 3.8999999999999986, count: 50
epoch: 28259, loss: 0.3667527437210083, rewards: 4.099999999999998, count: 52
epoch: 28269, loss: 1.563647747039795, rewards: 3.8999999999999986, count: 50
epoch: 28279, loss: 1.429490327835083, rewards: 3.8999999999999986, count: 50
epoch: 28289, loss: 6.888702392578125, rewards: 4.699999999999996, count: 58
epoch: 28299, loss: 1.3913614749908447, rewards: 3.8999999999999986, count: 50
epoch: 28309, loss: -1.6839134693145752, rewards: 13.299999999999972, count: 126
epoch: 28319, loss: 2.2559499740600586, rewards: 17.90000000000001, count: 163
epoch: 28329, loss: 4.188821792602539, rewards: 8.399999999999986, count: 86
epoch: 28339, loss: 1.7916874885559082, rewards: 12.899999999999974, count: 122
epoch: 28349, loss: 1.7622992992401123, rewards: 8.399999999999986, count: 86
epoch: 28359, loss: 2.099348783493042, rewards: 36.400000

epoch: 29279, loss: 5.498843669891357, rewards: 4.1999999999999975, count: 53
epoch: 29289, loss: -0.6101024150848389, rewards: 8.899999999999984, count: 91
epoch: 29299, loss: 0.31571242213249207, rewards: 22.40000000000006, count: 199
epoch: 29309, loss: 1.9218653440475464, rewards: 12.899999999999974, count: 122
epoch: 29319, loss: 2.8315320014953613, rewards: 64.40000000000042, count: 538
epoch: 29329, loss: 1.2903001308441162, rewards: 8.399999999999986, count: 86
epoch: 29339, loss: 3.588684320449829, rewards: 22.40000000000006, count: 199
epoch: 29349, loss: 0.17778410017490387, rewards: 8.399999999999986, count: 86
epoch: 29359, loss: -1.1006754636764526, rewards: 6.699999999999992, count: 69
epoch: 29369, loss: 0.8950291872024536, rewards: 4.099999999999998, count: 52
epoch: 29379, loss: 3.2315542697906494, rewards: 6.799999999999992, count: 70
epoch: 29389, loss: 0.9428470730781555, rewards: 27.500000000000117, count: 241
epoch: 29399, loss: -3.5546963214874268, rewards: 22.4

epoch: 30319, loss: 4.815852165222168, rewards: 12.899999999999974, count: 122
epoch: 30329, loss: 1.8198425769805908, rewards: 26.90000000000011, count: 235
epoch: 30339, loss: 1.5525532960891724, rewards: 17.90000000000001, count: 163
epoch: 30349, loss: 0.9942556023597717, rewards: 12.899999999999974, count: 122
epoch: 30359, loss: -1.037724494934082, rewards: 8.399999999999986, count: 86
epoch: 30369, loss: -13.770437240600586, rewards: 27.70000000000012, count: 243
epoch: 30379, loss: -0.9691389799118042, rewards: 22.40000000000006, count: 199
epoch: 30389, loss: 1.944625735282898, rewards: 12.899999999999974, count: 122
epoch: 30399, loss: 5.6511149406433105, rewards: 8.399999999999986, count: 86
epoch: 30409, loss: 3.0264554023742676, rewards: 8.399999999999986, count: 86
epoch: 30419, loss: -1.0362828969955444, rewards: 22.40000000000006, count: 199
epoch: 30429, loss: -1.2931571006774902, rewards: 12.899999999999974, count: 122
epoch: 30439, loss: 6.726395130157471, rewards: 3

epoch: 31359, loss: -1.5571752786636353, rewards: 22.40000000000006, count: 199
epoch: 31369, loss: -0.5329484939575195, rewards: 8.399999999999986, count: 86
epoch: 31379, loss: 2.487943172454834, rewards: 40.90000000000026, count: 348
epoch: 31389, loss: 8.002591133117676, rewards: 12.899999999999974, count: 122
epoch: 31399, loss: 3.267364978790283, rewards: 8.399999999999986, count: 86
epoch: 31409, loss: 1.0708022117614746, rewards: 37.10000000000022, count: 319
epoch: 31419, loss: 3.5042977333068848, rewards: 17.90000000000001, count: 163
epoch: 31429, loss: -2.0519626140594482, rewards: 27.00000000000011, count: 236
epoch: 31439, loss: 3.1968560218811035, rewards: 22.40000000000006, count: 199
epoch: 31449, loss: 1.473719835281372, rewards: 13.399999999999972, count: 127
epoch: 31459, loss: -1.3714194297790527, rewards: 4.299999999999997, count: 54
epoch: 31469, loss: 2.968625545501709, rewards: 8.399999999999986, count: 86
epoch: 31479, loss: 2.4669675827026367, rewards: 8.3999

epoch: 32409, loss: 0.6654038429260254, rewards: 8.399999999999986, count: 86
epoch: 32419, loss: 0.19512075185775757, rewards: 9.399999999999983, count: 96
epoch: 32429, loss: 0.4008731245994568, rewards: 8.399999999999986, count: 86
epoch: 32439, loss: 1.3868907690048218, rewards: 18.900000000000023, count: 173
epoch: 32449, loss: 4.375234603881836, rewards: 12.899999999999974, count: 122
epoch: 32459, loss: 1.3285531997680664, rewards: 12.899999999999974, count: 122
epoch: 32469, loss: 1.6118059158325195, rewards: 8.399999999999986, count: 86
epoch: 32479, loss: 4.460552215576172, rewards: 8.399999999999986, count: 86
epoch: 32489, loss: -1.2922507524490356, rewards: 8.399999999999986, count: 86
epoch: 32499, loss: -6.294743537902832, rewards: 8.399999999999986, count: 86
epoch: 32509, loss: 0.27784213423728943, rewards: 8.399999999999986, count: 86
epoch: 32519, loss: 2.241762638092041, rewards: 40.90000000000026, count: 348
epoch: 32529, loss: 3.325201988220215, rewards: 64.400000

epoch: 33449, loss: 4.23119592666626, rewards: 54.90000000000042, count: 461
epoch: 33459, loss: 2.4216701984405518, rewards: 54.90000000000042, count: 461
epoch: 33469, loss: 3.1728532314300537, rewards: 8.399999999999986, count: 86
epoch: 33479, loss: -6.220579624176025, rewards: 41.00000000000026, count: 349
epoch: 33489, loss: -2.169816255569458, rewards: 4.899999999999995, count: 60
epoch: 33499, loss: 0.6339800357818604, rewards: 4.299999999999997, count: 54
epoch: 33509, loss: -1.529746174812317, rewards: 27.10000000000011, count: 237
epoch: 33519, loss: 1.3199162483215332, rewards: 12.899999999999974, count: 122
epoch: 33529, loss: 2.774533987045288, rewards: 31.900000000000162, count: 276
epoch: 33539, loss: 5.546832084655762, rewards: 12.899999999999974, count: 122
epoch: 33549, loss: -1.4174996614456177, rewards: 27.300000000000114, count: 239
epoch: 33559, loss: 1.1946218013763428, rewards: 12.899999999999974, count: 122
epoch: 33569, loss: 2.6399083137512207, rewards: 17.9

epoch: 34489, loss: 5.4941725730896, rewards: 92.39999999999917, count: 764
epoch: 34499, loss: -0.3099120855331421, rewards: 50.70000000000037, count: 428
epoch: 34509, loss: 0.26770320534706116, rewards: 22.40000000000006, count: 199
epoch: 34519, loss: 2.1282825469970703, rewards: 36.40000000000021, count: 312
epoch: 34529, loss: -0.1344401091337204, rewards: 9.099999999999984, count: 93
epoch: 34539, loss: 1.514686107635498, rewards: 40.90000000000026, count: 348
epoch: 34549, loss: -0.01401912048459053, rewards: 8.399999999999986, count: 86
epoch: 34559, loss: -0.20493623614311218, rewards: 12.999999999999973, count: 123
epoch: 34569, loss: -1.4022760391235352, rewards: 12.999999999999973, count: 123
epoch: 34579, loss: 0.3835504651069641, rewards: 22.40000000000006, count: 199
epoch: 34589, loss: -2.7114648818969727, rewards: 74.69999999999995, count: 623
epoch: 34599, loss: 0.6419631838798523, rewards: 3.8999999999999986, count: 50
epoch: 34609, loss: -1.2658846378326416, reward

epoch: 35529, loss: 1.1550941467285156, rewards: 22.50000000000006, count: 200
epoch: 35539, loss: 4.330668926239014, rewards: 8.399999999999986, count: 86
epoch: 35549, loss: 0.05355316400527954, rewards: 4.599999999999996, count: 57
epoch: 35559, loss: 0.6697390675544739, rewards: 6.399999999999993, count: 66
epoch: 35569, loss: -7.13800573348999, rewards: 41.70000000000027, count: 356
epoch: 35579, loss: 3.327706813812256, rewards: 8.699999999999985, count: 89
epoch: 35589, loss: 1.723009467124939, rewards: 22.40000000000006, count: 199
epoch: 35599, loss: -0.07614564895629883, rewards: 65.10000000000038, count: 545
epoch: 35609, loss: 1.9270501136779785, rewards: 22.40000000000006, count: 199
epoch: 35619, loss: -0.20994798839092255, rewards: 8.799999999999985, count: 90
epoch: 35629, loss: -2.0541188716888428, rewards: 8.499999999999986, count: 87
epoch: 35639, loss: 5.1385498046875, rewards: 60.80000000000049, count: 511
epoch: 35649, loss: 0.6786987781524658, rewards: 78.3999999

epoch: 36569, loss: 2.171408176422119, rewards: 26.90000000000011, count: 235
epoch: 36579, loss: 9.232398986816406, rewards: 6.399999999999993, count: 66
epoch: 36589, loss: 1.42181396484375, rewards: 8.599999999999985, count: 88
epoch: 36599, loss: 0.32937878370285034, rewards: 3.8999999999999986, count: 50
epoch: 36609, loss: 1.5836776494979858, rewards: 3.8999999999999986, count: 50
epoch: 36619, loss: 1.5946805477142334, rewards: 6.299999999999994, count: 65
epoch: 36629, loss: 2.0475950241088867, rewards: 50.40000000000037, count: 425
epoch: 36639, loss: 1.5671929121017456, rewards: 22.40000000000006, count: 199
epoch: 36649, loss: 3.5573973655700684, rewards: 41.100000000000264, count: 350
epoch: 36659, loss: 4.933834552764893, rewards: 22.40000000000006, count: 199
epoch: 36669, loss: 2.3900675773620605, rewards: 54.90000000000042, count: 461
epoch: 36679, loss: -0.7929632663726807, rewards: 8.999999999999984, count: 92
epoch: 36689, loss: 0.8843176364898682, rewards: 12.899999

epoch: 37609, loss: 0.9369723796844482, rewards: 64.40000000000042, count: 538
epoch: 37619, loss: 4.319950103759766, rewards: 22.800000000000065, count: 203
epoch: 37629, loss: -1.4786121845245361, rewards: 8.499999999999986, count: 87
epoch: 37639, loss: -3.3685011863708496, rewards: 14.99999999999997, count: 134
epoch: 37649, loss: 1.4120004177093506, rewards: 22.40000000000006, count: 199
epoch: 37659, loss: -0.9957587718963623, rewards: 12.999999999999973, count: 123
epoch: 37669, loss: 1.6744801998138428, rewards: 26.90000000000011, count: 235
epoch: 37679, loss: -1.3179665803909302, rewards: 50.40000000000037, count: 425
epoch: 37689, loss: -0.7481474876403809, rewards: 50.40000000000037, count: 425
epoch: 37699, loss: 2.3805694580078125, rewards: 36.40000000000021, count: 312
epoch: 37709, loss: 2.02553653717041, rewards: 50.40000000000037, count: 425
epoch: 37719, loss: -1.7771867513656616, rewards: 6.5999999999999925, count: 68
epoch: 37729, loss: -2.7950263023376465, rewards

epoch: 38649, loss: 4.313085079193115, rewards: 31.900000000000162, count: 276
epoch: 38659, loss: 3.043397903442383, rewards: 22.40000000000006, count: 199
epoch: 38669, loss: 4.3771209716796875, rewards: 12.899999999999974, count: 122
epoch: 38679, loss: -0.7394389510154724, rewards: 12.899999999999974, count: 122
epoch: 38689, loss: 0.06322695314884186, rewards: 8.399999999999986, count: 86
epoch: 38699, loss: 0.1478601098060608, rewards: 12.899999999999974, count: 122
epoch: 38709, loss: 1.1310843229293823, rewards: 8.599999999999985, count: 88
epoch: 38719, loss: 2.4192631244659424, rewards: 26.90000000000011, count: 235
epoch: 38729, loss: -0.4440041780471802, rewards: 26.90000000000011, count: 235
epoch: 38739, loss: 0.5411017537117004, rewards: 8.399999999999986, count: 86
epoch: 38749, loss: 0.39301797747612, rewards: 8.399999999999986, count: 86
epoch: 38759, loss: 1.157911777496338, rewards: 8.599999999999985, count: 88
epoch: 38769, loss: -0.6661261916160583, rewards: 6.599

epoch: 39689, loss: -5.3169355392456055, rewards: 41.800000000000274, count: 357
epoch: 39699, loss: 2.2606568336486816, rewards: 40.90000000000026, count: 348
epoch: 39709, loss: -2.070769786834717, rewards: 27.00000000000011, count: 236
epoch: 39719, loss: -1.0572093725204468, rewards: 78.39999999999979, count: 651
epoch: 39729, loss: 2.1499226093292236, rewards: 68.90000000000022, count: 574
epoch: 39739, loss: -4.600516319274902, rewards: 46.70000000000033, count: 397
epoch: 39749, loss: -0.6235978603363037, rewards: 64.40000000000042, count: 538
epoch: 39759, loss: 5.816143989562988, rewards: 78.39999999999979, count: 651
epoch: 39769, loss: -3.2396082878112793, rewards: 22.40000000000006, count: 199
epoch: 39779, loss: -6.869331359863281, rewards: 12.999999999999973, count: 123
epoch: 39789, loss: -4.55935001373291, rewards: 26.90000000000011, count: 235
epoch: 39799, loss: 0.9608839750289917, rewards: 8.399999999999986, count: 86
epoch: 39809, loss: -1.9851491451263428, rewards:

epoch: 40729, loss: -9.54530143737793, rewards: 27.10000000000011, count: 237
epoch: 40739, loss: 3.95025634765625, rewards: 110.89999999999834, count: 913
epoch: 40749, loss: 0.726721465587616, rewards: 26.90000000000011, count: 235
epoch: 40759, loss: -1.0638073682785034, rewards: 8.399999999999986, count: 86
epoch: 40769, loss: -5.797787189483643, rewards: 20.800000000000036, count: 183
epoch: 40779, loss: 1.1924126148223877, rewards: 12.899999999999974, count: 122
epoch: 40789, loss: 0.293142169713974, rewards: 8.999999999999984, count: 92
epoch: 40799, loss: 1.4892420768737793, rewards: 50.40000000000037, count: 425
epoch: 40809, loss: 5.632964134216309, rewards: 50.40000000000037, count: 425
epoch: 40819, loss: 2.8263278007507324, rewards: 22.40000000000006, count: 199
epoch: 40829, loss: 1.4445788860321045, rewards: 8.399999999999986, count: 86
epoch: 40839, loss: 1.533639669418335, rewards: 8.399999999999986, count: 86
epoch: 40849, loss: 0.33760762214660645, rewards: 8.3999999

epoch: 41769, loss: -2.1674442291259766, rewards: 78.89999999999976, count: 656
epoch: 41779, loss: 1.016796588897705, rewards: 12.899999999999974, count: 122
epoch: 41789, loss: 3.635801076889038, rewards: 50.40000000000037, count: 425
epoch: 41799, loss: -5.404351234436035, rewards: 97.9999999999989, count: 811
epoch: 41809, loss: 2.191835641860962, rewards: 40.90000000000026, count: 348
epoch: 41819, loss: 1.566814661026001, rewards: 36.40000000000021, count: 312
epoch: 41829, loss: -13.677181243896484, rewards: 26.90000000000011, count: 235
epoch: 41839, loss: -5.766258716583252, rewards: 24.90000000000008, count: 215
epoch: 41849, loss: -1.692706823348999, rewards: 26.90000000000011, count: 235
epoch: 41859, loss: 0.6666521430015564, rewards: 8.399999999999986, count: 86
epoch: 41869, loss: 0.725138783454895, rewards: 8.399999999999986, count: 86
epoch: 41879, loss: -1.3335704803466797, rewards: 26.90000000000011, count: 235
epoch: 41889, loss: 0.17735415697097778, rewards: 6.7999

epoch: 42809, loss: -0.8689330220222473, rewards: 4.299999999999997, count: 54
epoch: 42819, loss: -0.6874544620513916, rewards: 12.899999999999974, count: 122
epoch: 42829, loss: 3.898317337036133, rewards: 50.40000000000037, count: 425
epoch: 42839, loss: -0.6604461669921875, rewards: 36.40000000000021, count: 312
epoch: 42849, loss: 2.587282180786133, rewards: 12.899999999999974, count: 122
epoch: 42859, loss: -7.144203186035156, rewards: 82.8999999999996, count: 687
epoch: 42869, loss: 4.153095722198486, rewards: 36.40000000000021, count: 312
epoch: 42879, loss: 6.00985860824585, rewards: 10.799999999999981, count: 101
epoch: 42889, loss: 4.467499732971191, rewards: 27.900000000000123, count: 245
epoch: 42899, loss: -3.5708200931549072, rewards: 23.30000000000007, count: 208
epoch: 42909, loss: 1.9130325317382812, rewards: 8.399999999999986, count: 86
epoch: 42919, loss: -1.1136823892593384, rewards: 22.40000000000006, count: 199
epoch: 42929, loss: 0.7200493216514587, rewards: 12.

epoch: 43849, loss: -1.1689890623092651, rewards: 22.40000000000006, count: 199
epoch: 43859, loss: 3.656120777130127, rewards: 26.90000000000011, count: 235
epoch: 43869, loss: -4.806471347808838, rewards: 40.90000000000026, count: 348
epoch: 43879, loss: -5.633748531341553, rewards: 12.899999999999974, count: 122
epoch: 43889, loss: 5.779407501220703, rewards: 17.90000000000001, count: 163
epoch: 43899, loss: 3.4260478019714355, rewards: 12.899999999999974, count: 122
epoch: 43909, loss: 2.9517838954925537, rewards: 92.39999999999917, count: 764
epoch: 43919, loss: 1.810032606124878, rewards: 36.40000000000021, count: 312
epoch: 43929, loss: 0.5136510133743286, rewards: 6.799999999999992, count: 70
epoch: 43939, loss: 0.7811428308486938, rewards: 12.899999999999974, count: 122
epoch: 43949, loss: 7.638484954833984, rewards: 26.90000000000011, count: 235
epoch: 43959, loss: 1.4797662496566772, rewards: 14.99999999999997, count: 134
epoch: 43969, loss: -5.48175048828125, rewards: 6.799

epoch: 44889, loss: 0.30425190925598145, rewards: 8.399999999999986, count: 86
epoch: 44899, loss: 5.68715763092041, rewards: 12.899999999999974, count: 122
epoch: 44909, loss: -0.09765195846557617, rewards: 4.299999999999997, count: 54
epoch: 44919, loss: 1.2061651945114136, rewards: 17.90000000000001, count: 163
epoch: 44929, loss: 7.67390775680542, rewards: 54.90000000000042, count: 461
epoch: 44939, loss: -0.1449277400970459, rewards: 6.399999999999993, count: 66
epoch: 44949, loss: 1.3786921501159668, rewards: 12.899999999999974, count: 122
epoch: 44959, loss: 1.2021064758300781, rewards: 13.499999999999972, count: 128
epoch: 44969, loss: 4.393946170806885, rewards: 10.599999999999982, count: 99
epoch: 44979, loss: 9.184005737304688, rewards: 36.50000000000021, count: 313
epoch: 44989, loss: 0.34464216232299805, rewards: 12.899999999999974, count: 122
epoch: 44999, loss: 1.5407400131225586, rewards: 82.8999999999996, count: 687
epoch: 45009, loss: 0.9786669015884399, rewards: 8.39

epoch: 45929, loss: -5.097573757171631, rewards: 69.70000000000017, count: 582
epoch: 45939, loss: 1.0192785263061523, rewards: 40.90000000000026, count: 348
epoch: 45949, loss: 0.030666351318359375, rewards: 122.49999999999785, count: 1000
epoch: 45959, loss: -1.0281414985656738, rewards: 22.700000000000063, count: 202
epoch: 45969, loss: 3.285306930541992, rewards: 12.899999999999974, count: 122
epoch: 45979, loss: -3.4127492904663086, rewards: 40.90000000000026, count: 348
epoch: 45989, loss: -7.1400675773620605, rewards: 23.000000000000068, count: 205
epoch: 45999, loss: -2.8318018913269043, rewards: 122.49999999999785, count: 1000
epoch: 46009, loss: -17.01276397705078, rewards: 36.600000000000215, count: 314
epoch: 46019, loss: -25.707555770874023, rewards: 121.29999999999787, count: 999
epoch: 46029, loss: -11.156332015991211, rewards: 36.50000000000021, count: 313
epoch: 46039, loss: -1.7292747497558594, rewards: 36.40000000000021, count: 312
epoch: 46049, loss: -9.990856170654

epoch: 46959, loss: -5.201713562011719, rewards: 8.399999999999986, count: 86
epoch: 46969, loss: -10.167583465576172, rewards: 22.800000000000065, count: 203
epoch: 46979, loss: -2.5800395011901855, rewards: 13.199999999999973, count: 125
epoch: 46989, loss: -7.6765923500061035, rewards: 8.699999999999985, count: 89
epoch: 46999, loss: -3.818751573562622, rewards: 32.40000000000017, count: 281
epoch: 47009, loss: -8.736973762512207, rewards: 88.59999999999933, count: 735
epoch: 47019, loss: 1.7661539316177368, rewards: 12.899999999999974, count: 122
epoch: 47029, loss: 0.16119909286499023, rewards: 82.8999999999996, count: 687
epoch: 47039, loss: -7.510813236236572, rewards: 40.90000000000026, count: 348
epoch: 47049, loss: -2.127807140350342, rewards: 27.200000000000113, count: 238
epoch: 47059, loss: -1.1948875188827515, rewards: 8.999999999999984, count: 92
epoch: 47069, loss: -1.039578914642334, rewards: 93.09999999999913, count: 771
epoch: 47079, loss: 0.31323373317718506, reward

epoch: 47989, loss: -3.1691434383392334, rewards: 8.599999999999985, count: 88
epoch: 47999, loss: 2.6297850608825684, rewards: 122.49999999999785, count: 1000
epoch: 48009, loss: -22.428253173828125, rewards: 90.39999999999928, count: 744
epoch: 48019, loss: 0.7984891533851624, rewards: 22.900000000000066, count: 204
epoch: 48029, loss: -1.6099814176559448, rewards: 4.699999999999996, count: 58
epoch: 48039, loss: -3.919074535369873, rewards: 88.89999999999931, count: 738
epoch: 48049, loss: -6.495848655700684, rewards: 26.90000000000011, count: 235
epoch: 48059, loss: -0.9282248020172119, rewards: 68.90000000000022, count: 574
epoch: 48069, loss: 1.9582586288452148, rewards: 26.90000000000011, count: 235
epoch: 48079, loss: -4.05925989151001, rewards: 82.8999999999996, count: 687
epoch: 48089, loss: -5.871802806854248, rewards: 45.90000000000032, count: 389
epoch: 48099, loss: 3.055534839630127, rewards: 122.49999999999785, count: 1000
epoch: 48109, loss: -25.473722457885742, rewards

epoch: 49019, loss: -6.327208042144775, rewards: 17.90000000000001, count: 163
epoch: 49029, loss: -0.1143110990524292, rewards: 51.30000000000038, count: 434
epoch: 49039, loss: -2.6705515384674072, rewards: 13.89999999999997, count: 132
epoch: 49049, loss: 4.106534957885742, rewards: 122.49999999999785, count: 1000
epoch: 49059, loss: -5.4847869873046875, rewards: 96.89999999999897, count: 800
epoch: 49069, loss: -5.563162326812744, rewards: 13.299999999999972, count: 126
epoch: 49079, loss: 3.076167345046997, rewards: 82.8999999999996, count: 687
epoch: 49089, loss: -0.15380346775054932, rewards: 82.8999999999996, count: 687
epoch: 49099, loss: -0.2638131380081177, rewards: 12.899999999999974, count: 122
epoch: 49109, loss: -1.66419517993927, rewards: 8.599999999999985, count: 88
epoch: 49119, loss: 1.7891337871551514, rewards: 39.100000000000236, count: 330
epoch: 49129, loss: 1.9336152076721191, rewards: 122.49999999999785, count: 1000
epoch: 49139, loss: -7.684728622436523, rewar

epoch: 50049, loss: -2.5318946838378906, rewards: 54.90000000000042, count: 461
epoch: 50059, loss: -4.64167594909668, rewards: 22.700000000000063, count: 202
epoch: 50069, loss: -2.311396598815918, rewards: 122.49999999999785, count: 1000
epoch: 50079, loss: -10.990028381347656, rewards: 120.39999999999792, count: 990
epoch: 50089, loss: -7.794826984405518, rewards: 37.20000000000022, count: 320
epoch: 50099, loss: -18.60331153869629, rewards: 111.19999999999833, count: 916
epoch: 50109, loss: -18.688520431518555, rewards: 54.90000000000042, count: 461
epoch: 50119, loss: 0.23402893543243408, rewards: 40.90000000000026, count: 348
epoch: 50129, loss: -13.586495399475098, rewards: 27.10000000000011, count: 237
epoch: 50139, loss: -1.6730296611785889, rewards: 20.700000000000035, count: 182
epoch: 50149, loss: 0.4036656618118286, rewards: 101.89999999999874, count: 841
epoch: 50159, loss: -3.0665364265441895, rewards: 122.49999999999785, count: 1000
epoch: 50169, loss: 2.677922725677490

epoch: 51079, loss: -1.0704715251922607, rewards: 11.09999999999998, count: 104
epoch: 51089, loss: -8.412489891052246, rewards: 36.40000000000021, count: 312
epoch: 51099, loss: -20.135616302490234, rewards: 22.40000000000006, count: 199
epoch: 51109, loss: -5.840038299560547, rewards: 25.00000000000008, count: 216
epoch: 51119, loss: -18.172239303588867, rewards: 46.50000000000033, count: 395
epoch: 51129, loss: -0.10463201999664307, rewards: 8.399999999999986, count: 86
epoch: 51139, loss: -4.269996643066406, rewards: 122.49999999999785, count: 1000
epoch: 51149, loss: -0.021706342697143555, rewards: 122.49999999999785, count: 1000
epoch: 51159, loss: -11.414331436157227, rewards: 25.300000000000086, count: 219
epoch: 51169, loss: 1.7979347705841064, rewards: 13.399999999999972, count: 127
epoch: 51179, loss: 2.4768335819244385, rewards: 110.89999999999834, count: 913
epoch: 51189, loss: -7.943777561187744, rewards: 78.49999999999979, count: 652
epoch: 51199, loss: 4.386593818664551

epoch: 52109, loss: 4.320688247680664, rewards: 8.399999999999986, count: 86
epoch: 52119, loss: -9.49194622039795, rewards: 55.70000000000043, count: 469
epoch: 52129, loss: -6.780430793762207, rewards: 69.2000000000002, count: 577
epoch: 52139, loss: -2.581392288208008, rewards: 51.600000000000385, count: 437
epoch: 52149, loss: -3.9377403259277344, rewards: 26.90000000000011, count: 235
epoch: 52159, loss: -5.275473594665527, rewards: 122.49999999999785, count: 1000
epoch: 52169, loss: 3.681119441986084, rewards: 12.899999999999974, count: 122
epoch: 52179, loss: -3.4909331798553467, rewards: 22.40000000000006, count: 199
epoch: 52189, loss: -0.26547425985336304, rewards: 40.90000000000026, count: 348
epoch: 52199, loss: -6.597242832183838, rewards: 71.30000000000014, count: 589
epoch: 52209, loss: -20.272480010986328, rewards: 78.39999999999979, count: 651
epoch: 52219, loss: -0.5524333119392395, rewards: 4.899999999999995, count: 60
epoch: 52229, loss: -9.710691452026367, rewards:

epoch: 53139, loss: -4.755020618438721, rewards: 36.40000000000021, count: 312
epoch: 53149, loss: -8.525954246520996, rewards: 24.90000000000008, count: 215
epoch: 53159, loss: -0.4862208366394043, rewards: 122.49999999999785, count: 1000
epoch: 53169, loss: -5.132149696350098, rewards: 13.199999999999973, count: 125
epoch: 53179, loss: 0.1898878812789917, rewards: 122.49999999999785, count: 1000
epoch: 53189, loss: -12.291873931884766, rewards: 122.49999999999785, count: 1000
epoch: 53199, loss: -14.277664184570312, rewards: 73.89999999999999, count: 615
epoch: 53209, loss: -3.0211939811706543, rewards: 122.49999999999785, count: 1000
epoch: 53219, loss: -8.885221481323242, rewards: 62.7000000000005, count: 521
epoch: 53229, loss: 1.388936161994934, rewards: 42.10000000000028, count: 360
epoch: 53239, loss: -3.0291388034820557, rewards: 82.8999999999996, count: 687
epoch: 53249, loss: 4.46918249130249, rewards: 122.49999999999785, count: 1000
epoch: 53259, loss: -2.478593111038208, r

epoch: 54169, loss: -2.557699203491211, rewards: 60.000000000000476, count: 503
epoch: 54179, loss: 3.7375948429107666, rewards: 122.49999999999785, count: 1000
epoch: 54189, loss: -3.546574115753174, rewards: 68.90000000000022, count: 574
epoch: 54199, loss: -5.564859867095947, rewards: 29.40000000000013, count: 251
epoch: 54209, loss: -2.837965965270996, rewards: 6.5999999999999925, count: 68
epoch: 54219, loss: -3.9314706325531006, rewards: 41.800000000000274, count: 357
epoch: 54229, loss: -7.724370002746582, rewards: 90.69999999999926, count: 747
epoch: 54239, loss: -2.5853543281555176, rewards: 122.49999999999785, count: 1000
epoch: 54249, loss: 0.2127489447593689, rewards: 40.90000000000026, count: 348
epoch: 54259, loss: -0.4083022475242615, rewards: 122.49999999999785, count: 1000
epoch: 54269, loss: -6.114821434020996, rewards: 59.900000000000475, count: 502
epoch: 54279, loss: -4.557427406311035, rewards: 27.300000000000114, count: 239
epoch: 54289, loss: -16.9698429107666, 

epoch: 55199, loss: -2.346684217453003, rewards: 122.49999999999785, count: 1000
epoch: 55209, loss: 0.13207578659057617, rewards: 12.899999999999974, count: 122
epoch: 55219, loss: 0.3890906572341919, rewards: 51.20000000000038, count: 433
epoch: 55229, loss: 3.691002130508423, rewards: 122.49999999999785, count: 1000
epoch: 55239, loss: 4.539862155914307, rewards: 12.899999999999974, count: 122
epoch: 55249, loss: -2.9062821865081787, rewards: 13.499999999999972, count: 128
epoch: 55259, loss: -3.2542011737823486, rewards: 50.40000000000037, count: 425
epoch: 55269, loss: -19.380342483520508, rewards: 73.89999999999999, count: 615
epoch: 55279, loss: 4.2409563064575195, rewards: 8.399999999999986, count: 86
epoch: 55289, loss: 1.9180411100387573, rewards: 8.399999999999986, count: 86
epoch: 55299, loss: -9.927629470825195, rewards: 68.90000000000022, count: 574
epoch: 55309, loss: 1.034490942955017, rewards: 26.90000000000011, count: 235
epoch: 55319, loss: -2.7337732315063477, rewar

epoch: 56229, loss: -12.423779487609863, rewards: 76.79999999999988, count: 635
epoch: 56239, loss: -0.8570585250854492, rewards: 32.60000000000017, count: 283
epoch: 56249, loss: 3.0628600120544434, rewards: 9.499999999999982, count: 97
epoch: 56259, loss: 0.27746737003326416, rewards: 8.599999999999985, count: 88
epoch: 56269, loss: -4.590113162994385, rewards: 64.90000000000039, count: 543
epoch: 56279, loss: 1.9055633544921875, rewards: 96.89999999999897, count: 800
epoch: 56289, loss: 2.905517816543579, rewards: 31.900000000000162, count: 276
epoch: 56299, loss: 1.8048081398010254, rewards: 54.90000000000042, count: 461
epoch: 56309, loss: -23.88072967529297, rewards: 85.29999999999951, count: 702
epoch: 56319, loss: 0.6601009964942932, rewards: 26.90000000000011, count: 235
epoch: 56329, loss: -9.579288482666016, rewards: 15.499999999999975, count: 139
epoch: 56339, loss: 3.862318277359009, rewards: 12.899999999999974, count: 122
epoch: 56349, loss: 1.2092397212982178, rewards: 1

epoch: 57269, loss: -17.75750160217285, rewards: 106.49999999999854, count: 878
epoch: 57279, loss: -2.0612072944641113, rewards: 122.49999999999785, count: 1000
epoch: 57289, loss: -4.683984756469727, rewards: 22.800000000000065, count: 203
epoch: 57299, loss: -3.968695640563965, rewards: 12.899999999999974, count: 122
epoch: 57309, loss: -3.714188814163208, rewards: 8.699999999999985, count: 89
epoch: 57319, loss: -2.066396713256836, rewards: 122.49999999999785, count: 1000
epoch: 57329, loss: 0.11463484168052673, rewards: 4.599999999999996, count: 57
epoch: 57339, loss: 2.7510247230529785, rewards: 8.399999999999986, count: 86
epoch: 57349, loss: 0.4821133613586426, rewards: 122.49999999999785, count: 1000
epoch: 57359, loss: -9.106158256530762, rewards: 69.00000000000021, count: 575
epoch: 57369, loss: 4.346589088439941, rewards: 8.399999999999986, count: 86
epoch: 57379, loss: 4.251248836517334, rewards: 8.399999999999986, count: 86
epoch: 57389, loss: 1.9799580574035645, rewards:

epoch: 58299, loss: -3.6097888946533203, rewards: 122.49999999999785, count: 1000
epoch: 58309, loss: 4.36491060256958, rewards: 25.300000000000086, count: 219
epoch: 58319, loss: 4.746364593505859, rewards: 6.299999999999994, count: 65
epoch: 58329, loss: 2.309998035430908, rewards: 6.5999999999999925, count: 68
epoch: 58339, loss: -2.4251482486724854, rewards: 6.5999999999999925, count: 68
epoch: 58349, loss: 1.1566259860992432, rewards: 3.9999999999999982, count: 51
epoch: 58359, loss: -0.5444607734680176, rewards: 18.300000000000015, count: 167
epoch: 58369, loss: -2.1573684215545654, rewards: 13.69999999999997, count: 130
epoch: 58379, loss: 1.1554489135742188, rewards: 26.90000000000011, count: 235
epoch: 58389, loss: -9.655359268188477, rewards: 13.099999999999973, count: 124
epoch: 58399, loss: 8.19179916381836, rewards: 122.49999999999785, count: 1000
epoch: 58409, loss: -8.006675720214844, rewards: 65.20000000000037, count: 546
epoch: 58419, loss: 1.5063332319259644, rewards:

epoch: 59339, loss: -2.704047203063965, rewards: 37.300000000000225, count: 321
epoch: 59349, loss: -12.124260902404785, rewards: 106.79999999999852, count: 881
epoch: 59359, loss: 1.0773292779922485, rewards: 12.899999999999974, count: 122
epoch: 59369, loss: 3.8292367458343506, rewards: 122.49999999999785, count: 1000
epoch: 59379, loss: -1.53343665599823, rewards: 3.8999999999999986, count: 50
epoch: 59389, loss: -6.2453765869140625, rewards: 32.70000000000017, count: 284
epoch: 59399, loss: 6.5986175537109375, rewards: 92.39999999999917, count: 764
epoch: 59409, loss: -2.2598230838775635, rewards: 3.8999999999999986, count: 50
epoch: 59419, loss: 1.3260972499847412, rewards: 68.90000000000022, count: 574
epoch: 59429, loss: -8.412999153137207, rewards: 92.99999999999913, count: 770
epoch: 59439, loss: -2.3846359252929688, rewards: 40.90000000000026, count: 348
epoch: 59449, loss: 3.601531505584717, rewards: 31.900000000000162, count: 276
epoch: 59459, loss: -0.12546539306640625, re

epoch: 60369, loss: 1.3360333442687988, rewards: 26.90000000000011, count: 235
epoch: 60379, loss: -15.430420875549316, rewards: 79.29999999999974, count: 660
epoch: 60389, loss: -10.938276290893555, rewards: 53.00000000000039, count: 442
epoch: 60399, loss: -0.0028135180473327637, rewards: 27.60000000000012, count: 242
epoch: 60409, loss: -1.0257607698440552, rewards: 3.9999999999999982, count: 51
epoch: 60419, loss: 6.735902786254883, rewards: 122.49999999999785, count: 1000
epoch: 60429, loss: 2.252256393432617, rewards: 12.899999999999974, count: 122
epoch: 60439, loss: -5.090937614440918, rewards: 59.900000000000475, count: 502
epoch: 60449, loss: 5.350939750671387, rewards: 122.49999999999785, count: 1000
epoch: 60459, loss: -0.06645864248275757, rewards: 40.90000000000026, count: 348
epoch: 60469, loss: 1.1380455493927002, rewards: 110.89999999999834, count: 913
epoch: 60479, loss: -0.5070271492004395, rewards: 90.69999999999926, count: 747
epoch: 60489, loss: -7.113240718841553

epoch: 61399, loss: -7.350118637084961, rewards: 22.700000000000063, count: 202
epoch: 61409, loss: -0.13963472843170166, rewards: 15.399999999999974, count: 138
epoch: 61419, loss: -0.5246254801750183, rewards: 12.899999999999974, count: 122
epoch: 61429, loss: 2.9600019454956055, rewards: 68.90000000000022, count: 574
epoch: 61439, loss: 8.961776733398438, rewards: 101.89999999999874, count: 841
epoch: 61449, loss: 0.5933877229690552, rewards: 68.90000000000022, count: 574
epoch: 61459, loss: 0.905219554901123, rewards: 12.899999999999974, count: 122
epoch: 61469, loss: -0.05693836510181427, rewards: 12.899999999999974, count: 122
epoch: 61479, loss: -4.27495813369751, rewards: 27.400000000000116, count: 240
epoch: 61489, loss: -2.264617443084717, rewards: 54.90000000000042, count: 461
epoch: 61499, loss: 3.0584065914154053, rewards: 54.90000000000042, count: 461
epoch: 61509, loss: -3.614469051361084, rewards: 88.59999999999933, count: 735
epoch: 61519, loss: 3.115212917327881, rewa

epoch: 62429, loss: 2.1360771656036377, rewards: 122.49999999999785, count: 1000
epoch: 62439, loss: 3.9663870334625244, rewards: 122.49999999999785, count: 1000
epoch: 62449, loss: 8.722448348999023, rewards: 122.49999999999785, count: 1000
epoch: 62459, loss: 5.75640344619751, rewards: 17.90000000000001, count: 163
epoch: 62469, loss: 1.6269056797027588, rewards: 122.49999999999785, count: 1000
epoch: 62479, loss: 1.11021089553833, rewards: 122.49999999999785, count: 1000
epoch: 62489, loss: 0.9987194538116455, rewards: 122.49999999999785, count: 1000
epoch: 62499, loss: -20.959434509277344, rewards: 101.99999999999874, count: 842
epoch: 62509, loss: 1.0460896492004395, rewards: 8.899999999999984, count: 91
epoch: 62519, loss: 0.8444697856903076, rewards: 4.4999999999999964, count: 56
epoch: 62529, loss: 2.865980625152588, rewards: 11.09999999999998, count: 104
epoch: 62539, loss: -1.7168774604797363, rewards: 46.20000000000032, count: 392
epoch: 62549, loss: -1.1197924613952637, rew

epoch: 63469, loss: -0.6335871815681458, rewards: 12.899999999999974, count: 122
epoch: 63479, loss: -3.4374561309814453, rewards: 8.699999999999985, count: 89
epoch: 63489, loss: 0.14267921447753906, rewards: 122.49999999999785, count: 1000
epoch: 63499, loss: -3.693943738937378, rewards: 82.8999999999996, count: 687
epoch: 63509, loss: 0.8634811639785767, rewards: 11.19999999999998, count: 105
epoch: 63519, loss: 3.8377737998962402, rewards: 40.90000000000026, count: 348
epoch: 63529, loss: -4.252503871917725, rewards: 22.40000000000006, count: 199
epoch: 63539, loss: -11.850787162780762, rewards: 47.000000000000334, count: 400
epoch: 63549, loss: -0.6947662234306335, rewards: 26.90000000000011, count: 235
epoch: 63559, loss: -1.647953987121582, rewards: 122.49999999999785, count: 1000
epoch: 63569, loss: 1.0324231386184692, rewards: 11.09999999999998, count: 104
epoch: 63579, loss: 1.050990104675293, rewards: 6.799999999999992, count: 70
epoch: 63589, loss: -4.8575263023376465, rewa

epoch: 64499, loss: 2.623328447341919, rewards: 12.899999999999974, count: 122
epoch: 64509, loss: 1.7311673164367676, rewards: 17.90000000000001, count: 163
epoch: 64519, loss: -5.1241374015808105, rewards: 32.200000000000166, count: 279
epoch: 64529, loss: -27.903066635131836, rewards: 87.89999999999937, count: 728
epoch: 64539, loss: -1.4783251285552979, rewards: 31.900000000000162, count: 276
epoch: 64549, loss: -4.555307388305664, rewards: 18.00000000000001, count: 164
epoch: 64559, loss: -10.567306518554688, rewards: 122.49999999999785, count: 1000
epoch: 64569, loss: 1.7701786756515503, rewards: 26.90000000000011, count: 235
epoch: 64579, loss: 4.1811347007751465, rewards: 31.900000000000162, count: 276
epoch: 64589, loss: 2.672698974609375, rewards: 40.90000000000026, count: 348
epoch: 64599, loss: -9.613795280456543, rewards: 94.5999999999991, count: 777
epoch: 64609, loss: -11.39040470123291, rewards: 34.20000000000018, count: 290
epoch: 64619, loss: -1.6161508560180664, rewa

epoch: 65529, loss: 0.21459245681762695, rewards: 22.40000000000006, count: 199
epoch: 65539, loss: -1.836900234222412, rewards: 64.90000000000039, count: 543
epoch: 65549, loss: 6.727289199829102, rewards: 26.90000000000011, count: 235
epoch: 65559, loss: -0.5112237334251404, rewards: 22.40000000000006, count: 199
epoch: 65569, loss: 1.976557970046997, rewards: 26.90000000000011, count: 235
epoch: 65579, loss: -0.7283045649528503, rewards: 78.39999999999979, count: 651
epoch: 65589, loss: 0.17368218302726746, rewards: 3.8999999999999986, count: 50
epoch: 65599, loss: 4.164046287536621, rewards: 96.89999999999897, count: 800
epoch: 65609, loss: 4.936396598815918, rewards: 73.89999999999999, count: 615
epoch: 65619, loss: -6.0264739990234375, rewards: 83.79999999999954, count: 696
epoch: 65629, loss: -6.502377033233643, rewards: 122.49999999999785, count: 1000
epoch: 65639, loss: -0.5350207686424255, rewards: 54.90000000000042, count: 461
epoch: 65649, loss: 6.7047929763793945, rewards:

epoch: 66569, loss: 2.762326717376709, rewards: 82.8999999999996, count: 687
epoch: 66579, loss: 3.336282730102539, rewards: 122.49999999999785, count: 1000
epoch: 66589, loss: -0.997633695602417, rewards: 122.49999999999785, count: 1000
epoch: 66599, loss: -14.890999794006348, rewards: 118.39999999999803, count: 970
epoch: 66609, loss: 4.1476898193359375, rewards: 22.40000000000006, count: 199
epoch: 66619, loss: -2.2613086700439453, rewards: 12.899999999999974, count: 122
epoch: 66629, loss: -3.212212562561035, rewards: 122.49999999999785, count: 1000
epoch: 66639, loss: -17.61789321899414, rewards: 106.59999999999853, count: 879
epoch: 66649, loss: 3.323331594467163, rewards: 54.90000000000042, count: 461
epoch: 66659, loss: 4.354866027832031, rewards: 12.899999999999974, count: 122
epoch: 66669, loss: -0.669205904006958, rewards: 106.39999999999854, count: 877
epoch: 66679, loss: -1.9523816108703613, rewards: 62.8000000000005, count: 522
epoch: 66689, loss: 2.226287841796875, rewar

epoch: 67599, loss: 2.3283674716949463, rewards: 122.49999999999785, count: 1000
epoch: 67609, loss: -8.152311325073242, rewards: 110.89999999999834, count: 913
epoch: 67619, loss: -2.3261303901672363, rewards: 122.49999999999785, count: 1000
epoch: 67629, loss: 6.920289993286133, rewards: 15.19999999999997, count: 136
epoch: 67639, loss: -7.602038383483887, rewards: 122.49999999999785, count: 1000
epoch: 67649, loss: 3.155254364013672, rewards: 40.90000000000026, count: 348
epoch: 67659, loss: -3.402759075164795, rewards: 122.49999999999785, count: 1000
epoch: 67669, loss: 4.08493709564209, rewards: 8.399999999999986, count: 86
epoch: 67679, loss: 2.2250163555145264, rewards: 40.90000000000026, count: 348
epoch: 67689, loss: 1.46628999710083, rewards: 122.49999999999785, count: 1000
epoch: 67699, loss: -10.342607498168945, rewards: 96.89999999999897, count: 800
epoch: 67709, loss: -7.087255001068115, rewards: 122.49999999999785, count: 1000
epoch: 67719, loss: 2.124251365661621, rewar

epoch: 68629, loss: -9.809694290161133, rewards: 122.49999999999785, count: 1000
epoch: 68639, loss: -1.7758522033691406, rewards: 85.39999999999951, count: 703
epoch: 68649, loss: -0.7372860908508301, rewards: 122.49999999999785, count: 1000
epoch: 68659, loss: 3.26222562789917, rewards: 122.49999999999785, count: 1000
epoch: 68669, loss: -4.798372268676758, rewards: 122.49999999999785, count: 1000
epoch: 68679, loss: -4.060297012329102, rewards: 22.700000000000063, count: 202
epoch: 68689, loss: 5.013842582702637, rewards: 26.90000000000011, count: 235
epoch: 68699, loss: -1.2786486148834229, rewards: 13.499999999999972, count: 128
epoch: 68709, loss: 5.221036434173584, rewards: 22.40000000000006, count: 199
epoch: 68719, loss: 1.582075595855713, rewards: 40.90000000000026, count: 348
epoch: 68729, loss: 3.213935136795044, rewards: 8.399999999999986, count: 86
epoch: 68739, loss: -0.7537124156951904, rewards: 122.49999999999785, count: 1000
epoch: 68749, loss: -0.407623291015625, rew

epoch: 69659, loss: -7.925774574279785, rewards: 14.99999999999997, count: 134
epoch: 69669, loss: 5.449563980102539, rewards: 122.49999999999785, count: 1000
epoch: 69679, loss: -0.6829848289489746, rewards: 47.000000000000334, count: 400
epoch: 69689, loss: 5.307504653930664, rewards: 64.40000000000042, count: 538
epoch: 69699, loss: -10.543134689331055, rewards: 8.999999999999984, count: 92
epoch: 69709, loss: 1.3276201486587524, rewards: 12.899999999999974, count: 122
epoch: 69719, loss: 3.4086568355560303, rewards: 54.90000000000042, count: 461
epoch: 69729, loss: -1.0385451316833496, rewards: 39.30000000000024, count: 332
epoch: 69739, loss: 1.0165619850158691, rewards: 40.90000000000026, count: 348
epoch: 69749, loss: -2.0303452014923096, rewards: 29.600000000000133, count: 253
epoch: 69759, loss: 5.372464656829834, rewards: 122.49999999999785, count: 1000
epoch: 69769, loss: -1.2079757452011108, rewards: 122.49999999999785, count: 1000
epoch: 69779, loss: -2.1463065147399902, r

epoch: 70699, loss: -2.1675543785095215, rewards: 110.89999999999834, count: 913
epoch: 70709, loss: 1.37119460105896, rewards: 12.899999999999974, count: 122
epoch: 70719, loss: 0.010648787021636963, rewards: 122.49999999999785, count: 1000
epoch: 70729, loss: 0.846243143081665, rewards: 13.299999999999972, count: 126
epoch: 70739, loss: -1.8434031009674072, rewards: 15.299999999999972, count: 137
epoch: 70749, loss: 4.349429607391357, rewards: 26.90000000000011, count: 235
epoch: 70759, loss: -1.0880177021026611, rewards: 34.600000000000186, count: 294
epoch: 70769, loss: 1.6704944372177124, rewards: 22.40000000000006, count: 199
epoch: 70779, loss: 1.3106545209884644, rewards: 122.49999999999785, count: 1000
epoch: 70789, loss: -12.874114990234375, rewards: 106.99999999999851, count: 883
epoch: 70799, loss: -13.54542064666748, rewards: 116.69999999999807, count: 962
epoch: 70809, loss: -9.118646621704102, rewards: 83.89999999999954, count: 697
epoch: 70819, loss: 4.257366180419922, 

epoch: 71729, loss: -1.3359346389770508, rewards: 40.90000000000026, count: 348
epoch: 71739, loss: -1.8095488548278809, rewards: 40.90000000000026, count: 348
epoch: 71749, loss: -3.2486531734466553, rewards: 24.90000000000008, count: 215
epoch: 71759, loss: 4.378616809844971, rewards: 8.399999999999986, count: 86
epoch: 71769, loss: 0.3974481225013733, rewards: 4.899999999999995, count: 60
epoch: 71779, loss: 0.17880240082740784, rewards: 53.10000000000039, count: 443
epoch: 71789, loss: -1.959162712097168, rewards: 96.89999999999897, count: 800
epoch: 71799, loss: -1.351187825202942, rewards: 96.89999999999897, count: 800
epoch: 71809, loss: 4.413675785064697, rewards: 26.90000000000011, count: 235
epoch: 71819, loss: 0.3033159077167511, rewards: 4.4999999999999964, count: 56
epoch: 71829, loss: -2.044851779937744, rewards: 50.800000000000374, count: 429
epoch: 71839, loss: -4.577502250671387, rewards: 122.49999999999785, count: 1000
epoch: 71849, loss: -15.99254322052002, rewards: 

epoch: 72759, loss: -4.795592308044434, rewards: 80.89999999999971, count: 667
epoch: 72769, loss: -0.5181204080581665, rewards: 122.49999999999785, count: 1000
epoch: 72779, loss: 4.024538040161133, rewards: 8.399999999999986, count: 86
epoch: 72789, loss: -4.208961486816406, rewards: 26.90000000000011, count: 235
epoch: 72799, loss: 0.21716946363449097, rewards: 4.299999999999997, count: 54
epoch: 72809, loss: 2.062282085418701, rewards: 3.8999999999999986, count: 50
epoch: 72819, loss: 3.2956106662750244, rewards: 9.499999999999982, count: 97
epoch: 72829, loss: -0.4264216423034668, rewards: 82.8999999999996, count: 687
epoch: 72839, loss: -1.9006757736206055, rewards: 4.399999999999997, count: 55
epoch: 72849, loss: -0.2779277265071869, rewards: 4.699999999999996, count: 58
epoch: 72859, loss: -18.36713409423828, rewards: 111.19999999999833, count: 916
epoch: 72869, loss: -4.0966949462890625, rewards: 122.49999999999785, count: 1000
epoch: 72879, loss: -7.258112907409668, rewards: 

epoch: 73789, loss: 2.6088879108428955, rewards: 6.199999999999994, count: 64
epoch: 73799, loss: 2.5895004272460938, rewards: 122.49999999999785, count: 1000
epoch: 73809, loss: 2.3942010402679443, rewards: 122.49999999999785, count: 1000
epoch: 73819, loss: 1.021870732307434, rewards: 40.90000000000026, count: 348
epoch: 73829, loss: 3.154627561569214, rewards: 8.399999999999986, count: 86
epoch: 73839, loss: 0.09170675277709961, rewards: 122.49999999999785, count: 1000
epoch: 73849, loss: 4.784360408782959, rewards: 122.49999999999785, count: 1000
epoch: 73859, loss: 0.13446760177612305, rewards: 122.49999999999785, count: 1000
epoch: 73869, loss: -0.2986879348754883, rewards: 68.90000000000022, count: 574
epoch: 73879, loss: -0.13248825073242188, rewards: 122.49999999999785, count: 1000
epoch: 73889, loss: -2.3064515590667725, rewards: 122.49999999999785, count: 1000
epoch: 73899, loss: 1.0582129955291748, rewards: 20.600000000000033, count: 181
epoch: 73909, loss: 2.93669795989990

KeyboardInterrupt: 

In [92]:
# def sample_action(self,state):
#     probs = self.policy_net(state) # 4
#     if np.random.uniform() < 0.3:
#         action = np.random.randint(0,2)
#         return action, torch.log(probs[action]+1e-8)
#     dist = Categorical(probs)
#     action = dist.sample()
#     log_prob = dist.log_prob(action)
#     return action.item(),log_prob

# # Replace the method
# import types
# agent.sample_action = types.MethodType(sample_action, agent)

In [None]:
import time
def visualize_agent(env, agent, num_episodes=5, epsilon=0.0):
    """
    Render the agent playing episodes with its greedy (argmax) policy.

    Args:
        env: Environment to play in. It is used as-is when its
            ``render_mode`` attribute equals ``"human"``. Otherwise (including
            ``None``) a human-rendered ``FlappyBird-v0`` environment is
            created for this call and closed before returning.
        agent: Object exposing ``policy_net``, a callable that maps a float32
            state tensor to a tensor of action probabilities.
        num_episodes: Number of episodes to play.
        epsilon: Probability, per step, of taking a random action drawn from
            ``env.action_space`` instead of the greedy action. The default of
            ``0.0`` means the policy is always followed.

    Returns:
        None. Per-episode step counts and total rewards are printed.
    """
    # Only take ownership of (and later close) an environment created here;
    # an environment supplied by the caller stays open for the caller to manage.
    owns_env = getattr(env, "render_mode", None) != "human"
    if owns_env:
        env = gym.make('FlappyBird-v0', render_mode='human', use_lidar=False)

    try:
        for episode in range(num_episodes):
            # Newer APIs return (observation, info); older ones return the
            # observation alone.
            reset_result = env.reset()
            state = reset_result[0] if isinstance(reset_result, tuple) else reset_result
            total_reward = 0
            steps = 0
            done = False

            print(f"\nEpisode {episode + 1}")

            while not done:
                env.render()  # draw the current state

                if np.random.random() < epsilon:
                    # Sample from the environment's own action space so the
                    # random action is always valid for this environment.
                    action = int(env.action_space.sample())
                else:
                    # Pick the most probable action under the trained policy.
                    with torch.no_grad():
                        state_tensor = torch.as_tensor(state, dtype=torch.float32)
                        probs = agent.policy_net(state_tensor)
                    action = probs.argmax().item()

                # Take the action; accept both 4-tuple and 5-tuple step results.
                step_result = env.step(action)
                if len(step_result) == 4:
                    next_state, reward, done, _ = step_result
                else:
                    next_state, reward, terminated, truncated, _ = step_result
                    done = terminated or truncated

                total_reward += reward
                steps += 1
                state = next_state

                # Small delay so the motion is easier to follow.
                time.sleep(0.01)

            print(f"Episode finished after {steps} steps. Total reward: {total_reward}")
    finally:
        # Release the window even if playback raises or is interrupted.
        if owns_env:
            env.close()

# Add at the end of the main program:
if __name__ == "__main__":
    # Show the agent's behavior once training has finished.
    print("\nVisualizing trained agent behavior...")
    env = gym.make('FlappyBird-v0',render_mode='human',use_lidar=False)
    try:
        visualize_agent(env, agent)
    finally:
        # Release this environment even if playback is interrupted
        # (e.g. by stopping the cell), so no render window is leaked.
        env.close()


Visualizing trained agent behavior...

Episode 1
Episode finished after 7128 steps. Total reward: 880.9000000001037

Episode 2


In [100]:
# Close the environment currently bound to `env`.
env.close()