In [1]:
import tensorflow as tf
# Report the installed TensorFlow version and the GPU device name
# (tf.test.gpu_device_name() returns an empty string when no GPU is found).
print('TensorFlow Version: {}'.format(tf.__version__))
print('Default GPU Device: {}'.format(tf.test.gpu_device_name()))

TensorFlow Version: 1.7.1
Default GPU Device: 


##### >**Note:** Make sure you have OpenAI Gym cloned. Then run this command `pip install -e gym/[all]`.

In [2]:
import numpy as np
import gym
# Only CartPole-v1 is used: the previous code first built a 'CartPole-v0' env
# and immediately overwrote it, which was wasted work and confusing to read.
env = gym.make('CartPole-v1')

In [3]:
def model_input(state_size):
    """Create the graph placeholders that feed one batch of transitions.

    Args:
        state_size: number of features in a single state vector.

    Returns:
        Tuple (states, actions, next_states, rewards, dones, rates) of
        placeholders. `states`/`next_states` are float32 [None, state_size];
        `actions` is int32 [None]; `rewards`, `dones` and `rates`
        (per-transition success rate) are float32 [None].
    """
    def _per_sample(dtype, name):
        # One scalar per transition in the batch.
        return tf.placeholder(dtype, [None], name=name)

    def _per_state(name):
        # One state vector per transition in the batch.
        return tf.placeholder(tf.float32, [None, state_size], name=name)

    # Placeholders are created in the same order as they are returned.
    states = _per_state('states')
    actions = _per_sample(tf.int32, 'actions')
    next_states = _per_state('next_states')
    rewards = _per_sample(tf.float32, 'rewards')
    dones = _per_sample(tf.float32, 'dones')
    rates = _per_sample(tf.float32, 'rates')
    return states, actions, next_states, rewards, dones, rates

In [4]:
def Act(states, action_size, hidden_size, reuse=False, alpha=0.1, training=False):
    """Actor network: maps states to unnormalized action logits.

    Args:
        states: float tensor of state vectors, one row per sample.
        action_size: number of output logits.
        hidden_size: number of units in each of the two hidden layers.
        reuse: passed to tf.variable_scope to share the 'Act' variables.
        alpha: negative-side slope of the leaky ReLU, max(alpha * x, x).
        training: forwarded to batch normalization.

    Returns:
        Tensor of action logits with `action_size` columns.
    """
    with tf.variable_scope('Act', reuse=reuse):
        hidden = states
        # Two identical blocks: dense -> batch norm -> leaky ReLU.
        for _ in range(2):
            dense_out = tf.layers.dense(inputs=hidden, units=hidden_size)
            normed = tf.layers.batch_normalization(dense_out, training=training)
            hidden = tf.maximum(alpha * normed, normed)

        # Linear output layer
        return tf.layers.dense(inputs=hidden, units=action_size)

In [5]:
def Env(states, actions, state_size, action_size, hidden_size, reuse=False, alpha=0.1, training=False):
    """Environment model: predicts next-state logits and a scalar Q logit from (state, action).

    Args:
        states: float tensor of state vectors, one row per sample.
        actions: float tensor concatenated (axis=1) onto the first hidden layer.
        state_size: width of the next-state output head.
        action_size: width of the FIRST hidden layer (see note below).
        hidden_size: width of the second hidden layer.
        reuse: passed to tf.variable_scope to share the 'Env' variables.
        alpha: negative-side slope of the leaky ReLU, max(alpha * x, x).
        training: forwarded to batch normalization.

    Returns:
        (states_logits, Qlogits): tensors with `state_size` columns and 1 column.
    """
    def _leaky_relu(x):
        return tf.maximum(alpha * x, x)

    with tf.variable_scope('Env', reuse=reuse):
        # State branch: dense -> batch norm -> leaky ReLU.
        # NOTE(review): this layer is sized by action_size, not hidden_size — confirm intended.
        state_features = _leaky_relu(tf.layers.batch_normalization(
            tf.layers.dense(inputs=states, units=action_size), training=training))

        # Fuse the state features with the action input, then a second hidden block.
        fused = tf.concat(axis=1, values=[state_features, actions])
        joint_features = _leaky_relu(tf.layers.batch_normalization(
            tf.layers.dense(inputs=fused, units=hidden_size), training=training))

        # Output heads.
        # NOTE(review): both heads are created with trainable=False, so their weights
        # are excluded from tf.trainable_variables() — confirm this is deliberate.
        next_state_head = tf.layers.dense(inputs=joint_features, units=state_size, trainable=False)
        q_head = tf.layers.dense(inputs=joint_features, units=1, trainable=False)
        return next_state_head, q_head

In [7]:
def model_loss(state_size, action_size, hidden_size, gamma,
               states, actions, next_states, rewards, dones, rates):
    """Build the Act/Env graphs and the three scalar losses used for training.

    Args:
        state_size, action_size, hidden_size: network sizes forwarded to Act/Env.
        gamma: discount factor applied to the bootstrapped next-step Q value.
        states, actions, next_states, rewards, dones, rates: the placeholders
            produced by model_input.

    Returns:
        (actions_logits, aloss, eloss, aloss2):
            actions_logits: Act output for `states`.
            aloss: softmax cross-entropy between Act logits and the taken actions.
            eloss: sum of the Env loss terms built below.
            aloss2: sum of the Act loss terms that are computed through Env.

    Note:
        Act and Env are always called here without `training`, so their batch
        normalization layers are built with the default training=False.
    """
    ################################################ a = act(s)
    # Behaviour-cloning term: Act should reproduce the actions that were taken.
    actions_logits = Act(states=states, hidden_size=hidden_size, action_size=action_size)
    actions_labels = tf.one_hot(indices=actions, depth=action_size, dtype=actions_logits.dtype)
    aloss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(logits=actions_logits, 
                                                                      labels=actions_labels))
    ################################################ s', r = env(s, a)
    ################################################ s', Q = env(s, a)
    ################################################ ~s', ~Q = env(s, ~a)
    # Env is evaluated twice on the same states: once with the one-hot taken
    # actions (e_*) and once with Act's raw logits as the action input (a_*).
    e_next_states_logits, eQs = Env(actions=actions_labels, states=states, hidden_size=hidden_size, 
                                    action_size=action_size, state_size=state_size)
    a_next_states_logits, aQs = Env(actions=actions_logits, states=states, hidden_size=hidden_size, 
                                    action_size=action_size, state_size=state_size, reuse=True)
    # Observed next states are squashed through a sigmoid to serve as labels
    # for the sigmoid cross-entropy terms below.
    next_states_labels = tf.nn.sigmoid(next_states)
    eloss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=e_next_states_logits, 
                                                                   labels=next_states_labels))
    # The same cross-entropy on the Act-driven prediction enters eloss negated
    # and aloss2 un-negated, so the two losses pull it in opposite directions.
    eloss += -tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=a_next_states_logits, 
                                                                     labels=next_states_labels)) # maximize loss
    aloss2 = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=a_next_states_logits, 
                                                                    labels=next_states_labels)) # minimize loss
    # Flatten the [batch, 1] Q outputs to [batch].
    eQs_logits = tf.reshape(eQs, shape=[-1])
    aQs_logits = tf.reshape(aQs, shape=[-1])
    #################################################### s'', Q' = ~env(s', ~a')
    # Next-step Q: Act chooses on next_states and Env scores it; the result is
    # zeroed for transitions where dones == 1.
    next_actions_logits = Act(states=next_states, hidden_size=hidden_size, action_size=action_size, reuse=True)
    _, aQs2 = Env(actions=next_actions_logits, states=next_states, hidden_size=hidden_size, 
                  action_size=action_size, state_size=state_size, reuse=True)
    aQs2_logits = tf.reshape(aQs2, shape=[-1]) * (1-dones)
    # GAN-style terms: Q of the taken action is fit to `rates`; the averaged
    # Act-driven Q is labelled 0 in eloss and 1 in aloss2.
    eloss += tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=eQs_logits, # GAN
                                                                    labels=rates)) # 0-1 real
    eloss += tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=(aQs_logits+aQs2_logits)/2, # GAN
                                                                    labels=tf.zeros_like(rates))) # min
    aloss2 += tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=(aQs_logits+aQs2_logits)/2, # GAN
                                                                     labels=tf.ones_like(rates))) # max
    ###################################################### Q(s,a)= r + Q'(s',a') # max
    # One-step bootstrapped target: reward plus the discounted next-step Q.
    targetQs = rewards + (gamma * aQs2_logits)
    eloss += tf.reduce_mean(tf.square(eQs_logits - targetQs)) # real
    eloss += tf.reduce_mean(tf.square(aQs_logits - rewards)) # min Q
    aloss2 += tf.reduce_mean(tf.square(aQs_logits - targetQs)) # max Q
    # Mean of the averaged Act-driven Q: added to eloss, subtracted from aloss2.
    eloss += tf.reduce_mean((aQs_logits+aQs2_logits)/2) # minimize Q
    aloss2 += -tf.reduce_mean((aQs_logits+aQs2_logits)/2) # maximize Q
    return actions_logits, aloss, eloss, aloss2

In [8]:
def model_opt(a_loss, e_loss, a_loss2, a_learning_rate, e_learning_rate):
    """Create one Adam training op per loss, each restricted to its own network.

    Args:
        a_loss: Act loss, minimized over the 'Act' variables.
        e_loss: Env loss, minimized over the 'Env' variables.
        a_loss2: second Act loss, minimized over the 'Act' variables.
        a_learning_rate: learning rate for both Act optimizers.
        e_learning_rate: learning rate for the Env optimizer.

    Returns:
        (a_opt, e_opt, a_opt2) training ops.
    """
    trainables = tf.trainable_variables()

    def _scoped(prefix):
        # Trainable variables whose name starts with the given scope prefix.
        return [v for v in trainables if v.name.startswith(prefix)]

    a_vars = _scoped('Act')
    e_vars = _scoped('Env')

    # Run the collected update ops before each optimizer step (required for batch norm).
    pending_updates = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    with tf.control_dependencies(pending_updates):
        a_opt = tf.train.AdamOptimizer(a_learning_rate).minimize(a_loss, var_list=a_vars)
        e_opt = tf.train.AdamOptimizer(e_learning_rate).minimize(e_loss, var_list=e_vars)
        a_opt2 = tf.train.AdamOptimizer(a_learning_rate).minimize(a_loss2, var_list=a_vars)
    return a_opt, e_opt, a_opt2

In [9]:
class Model:
    """Bundles the placeholders, losses and training ops of the Act/Env model."""

    def __init__(self, state_size, action_size, hidden_size, a_learning_rate, e_learning_rate, gamma):
        """Build the full graph in the current default graph.

        Args:
            state_size, action_size, hidden_size: network sizes.
            a_learning_rate: learning rate for the Act optimizers.
            e_learning_rate: learning rate for the Env optimizer.
            gamma: discount factor used in the Q target.
        """
        # Inputs: placeholders through which batches are fed to the graph
        placeholders = model_input(state_size=state_size)
        (self.states, self.actions, self.next_states,
         self.rewards, self.dones, self.rates) = placeholders

        # Forward pass and losses
        outputs = model_loss(
            state_size=state_size, action_size=action_size,
            hidden_size=hidden_size, gamma=gamma,
            states=self.states, actions=self.actions, next_states=self.next_states,
            rewards=self.rewards, dones=self.dones, rates=self.rates)
        self.actions_logits, self.a_loss, self.e_loss, self.a_loss2 = outputs

        # Backward pass: one training op per loss
        train_ops = model_opt(a_loss=self.a_loss,
                              e_loss=self.e_loss,
                              a_loss2=self.a_loss2,
                              a_learning_rate=a_learning_rate,
                              e_learning_rate=e_learning_rate)
        self.a_opt, self.e_opt, self.a_opt2 = train_ops

In [10]:
from collections import deque
class Memory():
    """Fixed-capacity replay buffer backed by a deque.

    Once `max_size` entries are stored, appending a new entry silently drops
    the oldest one (deque `maxlen` semantics). Callers append to and read from
    `self.buffer` directly.
    """
    def __init__(self, max_size = 1000):
        """Create an empty buffer holding at most `max_size` entries."""
        self.buffer = deque(maxlen=max_size) # data batch

    def sample(self, batch_size):
        """Return `batch_size` distinct entries drawn uniformly at random.

        Raises:
            ValueError: (from np.random.choice) if `batch_size` exceeds the
                number of stored entries, since sampling is without replacement.
        """
        idx = np.random.choice(np.arange(len(self.buffer)), size=batch_size, replace=False)
        return [self.buffer[ii] for ii in idx]

## Hyperparameters

One of the more difficult aspects of reinforcement learning is the large number of hyperparameters. Not only are we tuning the network, but we're also tuning the simulation.

In [11]:
# Display the environment's observation and action spaces
env.observation_space, env.action_space

(Box(4,), Discrete(2))

In [12]:
# Exploration parameters (epsilon-greedy schedule used in the training loop)
explore_start = 1.0            # exploration probability at start
explore_stop = 0.01           # minimum exploration probability 
decay_rate = 0.0001            # exponential decay rate for exploration prob

# Network parameters
state_size = 4                # length of one state vector
action_size = 2               # number of discrete actions
hidden_size = 4*2             # hidden-layer width passed to the Act and Env networks
a_learning_rate = 1e-4         # learning rate of the Act optimizers
e_learning_rate = 1e-4         # learning rate of the Env optimizer

# Memory parameters
memory_size = int(1e5)            # memory capacity
batch_size = int(1e3)             # experience mini-batch size
gamma=0.99                        # discount factor used in the Q target

In [13]:
# Reset/init the graph/session
# tf.reset_default_graph() returns None, so the fresh default graph has to be
# fetched explicitly; otherwise `graph` would be None and tf.Session(graph=graph)
# would only work by silently falling back to the default graph.
tf.reset_default_graph()
graph = tf.get_default_graph()

# Init the model (its ops are created in the default graph fetched above)
model = Model(action_size=action_size, state_size=state_size, hidden_size=hidden_size, gamma=gamma,
              a_learning_rate=a_learning_rate, e_learning_rate=e_learning_rate)

# Init the memory
memory = Memory(max_size=memory_size)

In [14]:
# Prefill the replay memory with `memory_size` transitions from a random policy.
# Each transition is stored as [state, action, next_state, reward, done, rate];
# `rate` is written as the sentinel -1 and back-filled once the episode ends.
state = env.reset()
total_reward = 0
num_step = 0
for _ in range(memory_size):
    action = env.action_space.sample()
    next_state, reward, done, _ = env.step(action)
    rate = -1 # sentinel: episode not rated yet
    memory.buffer.append([state, action, next_state, reward, float(done), rate])
    num_step += 1 # memory incremented
    total_reward += reward
    state = next_state
    # Truthiness check instead of `done is True`: the identity test silently
    # fails if the environment returns a truthy non-`bool` flag (e.g. a numpy bool).
    if done:
        # Episode return normalized by 500.
        # NOTE(review): 500 is presumably the CartPole-v1 step cap — confirm.
        rate = total_reward/500
        for idx in range(num_step): # episode length
            if memory.buffer[-1-idx][-1] == -1:
                memory.buffer[-1-idx][-1] = rate
        state = env.reset()
        total_reward = 0 # reset
        num_step = 0 # reset
# Transitions of a final, unfinished episode keep rate == -1.

## Training the model

Below we'll train our agent. If you want to watch it train, add an `env.render()` call inside the step loop (the cell below does not include one). This is slow because rendering the frames takes longer than a training step, but it's cool to watch the agent get better at the game.

In [None]:
# Save/load the model and save for plotting
saver = tf.train.Saver()
episode_rewards_list, rewards_list = [], []
aloss_list, eloss_list, aloss2_list = [], [], []

# TF session for training
with tf.Session(graph=graph) as sess:
    sess.run(tf.global_variables_initializer())
    #saver.restore(sess, 'checkpoints/model.ckpt')    
    #saver.restore(sess, tf.train.latest_checkpoint('checkpoints'))
    total_step = 0 # Explore or exploit parameter
    episode_reward = deque(maxlen=100) # 100 episodes for running average/running mean/window

    # Training episodes/epochs
    for ep in range(11111):
        aloss_batch, eloss_batch, aloss2_batch = [], [], []
        total_reward = 0
        state = env.reset()
        num_step = 0
        rate = -1 # sentinel: transitions of this episode are rated after it ends

        # Training steps/batches
        while True:
            # Explore (env) or Exploit (model)
            total_step += 1
            explore_p = explore_stop + (explore_start - explore_stop) * np.exp(-decay_rate * total_step) 
            if explore_p > np.random.rand():
                action = env.action_space.sample()
            else:
                action_logits = sess.run(model.actions_logits, feed_dict={model.states: state.reshape([1, -1])})
                action = np.argmax(action_logits)
            next_state, reward, done, _ = env.step(action)
            memory.buffer.append([state, action, next_state, reward, float(done), rate])
            num_step += 1 # memory added
            total_reward += reward
            state = next_state
            
            # Training with the max-rated part of a randomly chosen contiguous chunk.
            # Snapshot the buffer once per step and slice the chunk BEFORE building
            # arrays; the previous code rebuilt six full-buffer arrays per attempt.
            batch = list(memory.buffer)
            num_chunks = memory_size // batch_size
            while True:
                idx = np.random.choice(np.arange(num_chunks))
                chunk = batch[idx*batch_size:(idx+1)*batch_size]
                states = np.array([each[0] for each in chunk])
                actions = np.array([each[1] for each in chunk])
                next_states = np.array([each[2] for each in chunk])
                rewards = np.array([each[3] for each in chunk])
                dones = np.array([each[4] for each in chunk])
                rates = np.array([each[5] for each in chunk])
                # Keep only the transitions that carry the highest rate in the chunk
                best = rates >= np.max(rates)
                states = states[best]
                actions = actions[best]
                next_states = next_states[best]
                rewards = rewards[best]
                dones = dones[best]
                rates = rates[best]
                # Accept the chunk only if it contains a terminal transition, more
                # than one sample, and a positive (i.e. already assigned) rate.
                if np.count_nonzero(dones) > 0 and len(dones) > 1 and np.max(rates) > 0:
                    break
            # The same batch feeds all three sequential updates
            feed_dict = {model.states: states,
                         model.actions: actions,
                         model.next_states: next_states,
                         model.rewards: rewards,
                         model.dones: dones,
                         model.rates: rates}
            aloss, _ = sess.run([model.a_loss, model.a_opt], feed_dict=feed_dict)
            eloss, _ = sess.run([model.e_loss, model.e_opt], feed_dict=feed_dict)
            aloss2, _ = sess.run([model.a_loss2, model.a_opt2], feed_dict=feed_dict)
            # print(len(dones), np.count_nonzero(dones), np.max(rates))
            aloss_batch.append(aloss)
            eloss_batch.append(eloss)
            aloss2_batch.append(aloss2)
            # Truthiness check instead of `done is True`, which silently fails
            # for a truthy non-`bool` flag (e.g. a numpy bool).
            if done:
                break
                
        # Rating the latest played episode
        rate = total_reward/500 # update rate at the end/ when episode is done
        for idx in range(num_step): # episode length
            if memory.buffer[-1-idx][-1] == -1: # double-check the landmark/marked indexes
                memory.buffer[-1-idx][-1] = rate # rate the trajectory/data

        # Print out
        episode_reward.append(total_reward)
        print('Episode:{}'.format(ep),
              'meanR:{:.4f}'.format(np.mean(episode_reward)),
              'R:{:.4f}'.format(total_reward),
              'rate:{:.4f}'.format(rate),
              'aloss:{:.4f}'.format(np.mean(aloss_batch)),
              'eloss:{:.4f}'.format(np.mean(eloss_batch)),
              'aloss2:{:.4f}'.format(np.mean(aloss2_batch)),
              'exploreP:{:.4f}'.format(explore_p))

        # Plotting out
        episode_rewards_list.append([ep, np.mean(episode_reward)])
        rewards_list.append([ep, total_reward])
        aloss_list.append([ep, np.mean(aloss_batch)])
        eloss_list.append([ep, np.mean(eloss_batch)])
        aloss2_list.append([ep, np.mean(aloss2_batch)])
        
        # Break episode/epoch loop
        # Stop once the running mean over the last (up to) 100 episodes reaches 500.
        if np.mean(episode_reward) >= 500:
            break
            
    # At the end of all training episodes/epochs
    # NOTE(review): saving may fail if the 'checkpoints' directory does not exist — confirm.
    saver.save(sess, 'checkpoints/model.ckpt')

Episode:0 meanR:20.0000 R:20.0000 rate:0.0400 aloss:0.7096 eloss:3.5596 aloss2:2.3476 exploreP:0.9980
Episode:1 meanR:20.5000 R:21.0000 rate:0.0420 aloss:0.6977 eloss:3.6005 aloss2:2.3310 exploreP:0.9959
Episode:2 meanR:18.0000 R:13.0000 rate:0.0260 aloss:0.7075 eloss:3.5387 aloss2:2.3253 exploreP:0.9947
Episode:3 meanR:20.0000 R:26.0000 rate:0.0520 aloss:0.7058 eloss:3.5654 aloss2:2.3059 exploreP:0.9921
Episode:4 meanR:19.6000 R:18.0000 rate:0.0360 aloss:0.7132 eloss:3.5379 aloss2:2.2931 exploreP:0.9903
Episode:5 meanR:23.0000 R:40.0000 rate:0.0800 aloss:0.7033 eloss:3.5678 aloss2:2.2756 exploreP:0.9864
Episode:6 meanR:21.4286 R:12.0000 rate:0.0240 aloss:0.6987 eloss:3.5159 aloss2:2.2951 exploreP:0.9853
Episode:7 meanR:20.7500 R:16.0000 rate:0.0320 aloss:0.7110 eloss:3.5731 aloss2:2.2345 exploreP:0.9837
Episode:8 meanR:21.0000 R:23.0000 rate:0.0460 aloss:0.6993 eloss:3.4712 aloss2:2.2814 exploreP:0.9815
Episode:9 meanR:23.1000 R:42.0000 rate:0.0840 aloss:0.6983 eloss:3.4844 aloss2:2.2

Episode:80 meanR:24.9630 R:20.0000 rate:0.0400 aloss:0.6895 eloss:3.1338 aloss2:2.2443 exploreP:0.8188
Episode:81 meanR:24.8902 R:19.0000 rate:0.0380 aloss:0.6872 eloss:3.1618 aloss2:2.1847 exploreP:0.8172
Episode:82 meanR:24.7831 R:16.0000 rate:0.0320 aloss:0.6860 eloss:3.1555 aloss2:2.2287 exploreP:0.8159
Episode:83 meanR:24.6548 R:14.0000 rate:0.0280 aloss:0.6918 eloss:3.1356 aloss2:2.2118 exploreP:0.8148
Episode:84 meanR:24.6000 R:20.0000 rate:0.0400 aloss:0.6902 eloss:3.1273 aloss2:2.2454 exploreP:0.8132
Episode:85 meanR:24.7093 R:34.0000 rate:0.0680 aloss:0.6888 eloss:3.1194 aloss2:2.2690 exploreP:0.8105
Episode:86 meanR:24.7241 R:26.0000 rate:0.0520 aloss:0.6851 eloss:3.1317 aloss2:2.2572 exploreP:0.8084
Episode:87 meanR:24.5909 R:13.0000 rate:0.0260 aloss:0.6881 eloss:3.0986 aloss2:2.2683 exploreP:0.8074
Episode:88 meanR:24.5056 R:17.0000 rate:0.0340 aloss:0.6878 eloss:3.1345 aloss2:2.2133 exploreP:0.8060
Episode:89 meanR:24.4778 R:22.0000 rate:0.0440 aloss:0.6898 eloss:3.1070 

Episode:159 meanR:33.5200 R:33.0000 rate:0.0660 aloss:0.6719 eloss:3.2589 aloss2:2.2463 exploreP:0.6284
Episode:160 meanR:34.1000 R:69.0000 rate:0.1380 aloss:0.6813 eloss:3.2871 aloss2:2.2261 exploreP:0.6242
Episode:161 meanR:34.6900 R:75.0000 rate:0.1500 aloss:0.6923 eloss:3.3132 aloss2:2.2196 exploreP:0.6196
Episode:162 meanR:35.0100 R:71.0000 rate:0.1420 aloss:0.7153 eloss:3.3293 aloss2:2.2147 exploreP:0.6153
Episode:163 meanR:35.7400 R:113.0000 rate:0.2260 aloss:0.7146 eloss:3.3201 aloss2:2.2283 exploreP:0.6085
Episode:164 meanR:35.4800 R:38.0000 rate:0.0760 aloss:0.6834 eloss:3.2943 aloss2:2.2362 exploreP:0.6062
Episode:165 meanR:36.7400 R:153.0000 rate:0.3060 aloss:0.7059 eloss:3.3109 aloss2:2.2399 exploreP:0.5972
Episode:166 meanR:36.9500 R:80.0000 rate:0.1600 aloss:0.7038 eloss:3.2894 aloss2:2.2598 exploreP:0.5925
Episode:167 meanR:36.7000 R:16.0000 rate:0.0320 aloss:0.7268 eloss:3.3257 aloss2:2.2165 exploreP:0.5916
Episode:168 meanR:37.3900 R:84.0000 rate:0.1680 aloss:0.7044 e

Episode:238 meanR:93.1700 R:99.0000 rate:0.1980 aloss:0.7999 eloss:3.3360 aloss2:2.2449 exploreP:0.2786
Episode:239 meanR:92.9600 R:26.0000 rate:0.0520 aloss:0.7421 eloss:3.3367 aloss2:2.2634 exploreP:0.2779
Episode:240 meanR:93.2500 R:128.0000 rate:0.2560 aloss:0.7562 eloss:3.3534 aloss2:2.2583 exploreP:0.2745
Episode:241 meanR:92.9700 R:37.0000 rate:0.0740 aloss:0.7547 eloss:3.3451 aloss2:2.2592 exploreP:0.2735
Episode:242 meanR:93.9900 R:130.0000 rate:0.2600 aloss:0.7576 eloss:3.3679 aloss2:2.2556 exploreP:0.2701
Episode:243 meanR:93.9500 R:30.0000 rate:0.0600 aloss:0.7360 eloss:3.3648 aloss2:2.2667 exploreP:0.2693
Episode:244 meanR:94.4600 R:108.0000 rate:0.2160 aloss:0.7332 eloss:3.3426 aloss2:2.2779 exploreP:0.2665
Episode:245 meanR:95.6500 R:146.0000 rate:0.2920 aloss:0.7714 eloss:3.3442 aloss2:2.2602 exploreP:0.2628
Episode:246 meanR:96.8900 R:139.0000 rate:0.2780 aloss:0.7693 eloss:3.3451 aloss2:2.2843 exploreP:0.2593
Episode:247 meanR:97.4400 R:144.0000 rate:0.2880 aloss:0.76

Episode:317 meanR:71.3900 R:17.0000 rate:0.0340 aloss:0.7330 eloss:3.3287 aloss2:2.2728 exploreP:0.1785
Episode:318 meanR:70.7200 R:47.0000 rate:0.0940 aloss:0.7494 eloss:3.3411 aloss2:2.2738 exploreP:0.1777
Episode:319 meanR:69.3000 R:24.0000 rate:0.0480 aloss:0.7297 eloss:3.3940 aloss2:2.2390 exploreP:0.1773
Episode:320 meanR:68.1500 R:15.0000 rate:0.0300 aloss:0.7315 eloss:3.3322 aloss2:2.2785 exploreP:0.1771
Episode:321 meanR:66.3600 R:21.0000 rate:0.0420 aloss:0.7223 eloss:3.3531 aloss2:2.2560 exploreP:0.1767
Episode:322 meanR:64.8600 R:28.0000 rate:0.0560 aloss:0.7317 eloss:3.3458 aloss2:2.2638 exploreP:0.1763
Episode:323 meanR:63.5300 R:33.0000 rate:0.0660 aloss:0.7665 eloss:3.3255 aloss2:2.2913 exploreP:0.1757
Episode:324 meanR:63.4100 R:23.0000 rate:0.0460 aloss:0.7489 eloss:3.3236 aloss2:2.2820 exploreP:0.1753
Episode:325 meanR:63.3000 R:31.0000 rate:0.0620 aloss:0.7725 eloss:3.3409 aloss2:2.2725 exploreP:0.1748
Episode:326 meanR:61.8600 R:29.0000 rate:0.0580 aloss:0.7444 elo

Episode:396 meanR:46.0700 R:82.0000 rate:0.1640 aloss:0.7655 eloss:3.2370 aloss2:2.2835 exploreP:0.1216
Episode:397 meanR:46.4400 R:59.0000 rate:0.1180 aloss:0.7313 eloss:3.2420 aloss2:2.2770 exploreP:0.1210
Episode:398 meanR:47.3000 R:113.0000 rate:0.2260 aloss:0.7393 eloss:3.2472 aloss2:2.2538 exploreP:0.1197
Episode:399 meanR:48.1400 R:111.0000 rate:0.2220 aloss:0.7276 eloss:3.2627 aloss2:2.2403 exploreP:0.1185
Episode:400 meanR:48.5100 R:64.0000 rate:0.1280 aloss:0.7357 eloss:3.2492 aloss2:2.2573 exploreP:0.1178
Episode:401 meanR:49.2100 R:95.0000 rate:0.1900 aloss:0.7573 eloss:3.2429 aloss2:2.2548 exploreP:0.1168
Episode:402 meanR:49.8300 R:73.0000 rate:0.1460 aloss:0.7345 eloss:3.1889 aloss2:2.2817 exploreP:0.1160
Episode:403 meanR:50.5900 R:114.0000 rate:0.2280 aloss:0.7286 eloss:3.1777 aloss2:2.2907 exploreP:0.1148
Episode:404 meanR:51.7400 R:146.0000 rate:0.2920 aloss:0.7059 eloss:3.2269 aloss2:2.2384 exploreP:0.1133
Episode:405 meanR:52.2900 R:73.0000 rate:0.1460 aloss:0.7404

Episode:475 meanR:71.5100 R:185.0000 rate:0.3700 aloss:0.7318 eloss:3.2795 aloss2:2.2287 exploreP:0.0767
Episode:476 meanR:72.6600 R:169.0000 rate:0.3380 aloss:0.7404 eloss:3.2799 aloss2:2.2292 exploreP:0.0756
Episode:477 meanR:72.8700 R:75.0000 rate:0.1500 aloss:0.7321 eloss:3.2863 aloss2:2.2287 exploreP:0.0751
Episode:478 meanR:73.8900 R:185.0000 rate:0.3700 aloss:0.7495 eloss:3.3138 aloss2:2.2066 exploreP:0.0739
Episode:479 meanR:74.5500 R:159.0000 rate:0.3180 aloss:0.7270 eloss:3.3086 aloss2:2.2243 exploreP:0.0729
Episode:480 meanR:75.5900 R:201.0000 rate:0.4020 aloss:0.8059 eloss:3.3356 aloss2:2.2089 exploreP:0.0716
Episode:481 meanR:76.1200 R:185.0000 rate:0.3700 aloss:0.7607 eloss:3.3154 aloss2:2.2373 exploreP:0.0705
Episode:482 meanR:75.8300 R:46.0000 rate:0.0920 aloss:0.7102 eloss:3.2954 aloss2:2.2405 exploreP:0.0702
Episode:483 meanR:76.4800 R:191.0000 rate:0.3820 aloss:0.7530 eloss:3.3143 aloss2:2.2226 exploreP:0.0691
Episode:484 meanR:77.3800 R:187.0000 rate:0.3740 aloss:0.

Episode:554 meanR:64.1400 R:34.0000 rate:0.0680 aloss:0.8315 eloss:3.3212 aloss2:2.2970 exploreP:0.0513
Episode:555 meanR:63.5700 R:17.0000 rate:0.0340 aloss:0.8140 eloss:3.3298 aloss2:2.2616 exploreP:0.0512
Episode:556 meanR:63.2100 R:18.0000 rate:0.0360 aloss:0.8144 eloss:3.3298 aloss2:2.3070 exploreP:0.0512
Episode:557 meanR:63.0100 R:36.0000 rate:0.0720 aloss:0.8127 eloss:3.3103 aloss2:2.3029 exploreP:0.0510
Episode:558 meanR:62.6600 R:15.0000 rate:0.0300 aloss:0.8441 eloss:3.3530 aloss2:2.2671 exploreP:0.0510
Episode:559 meanR:62.2500 R:20.0000 rate:0.0400 aloss:0.7911 eloss:3.3258 aloss2:2.2991 exploreP:0.0509
Episode:560 meanR:61.9100 R:20.0000 rate:0.0400 aloss:0.8085 eloss:3.3083 aloss2:2.3318 exploreP:0.0508
Episode:561 meanR:61.6800 R:19.0000 rate:0.0380 aloss:0.8169 eloss:3.3290 aloss2:2.3150 exploreP:0.0507
Episode:562 meanR:61.1000 R:14.0000 rate:0.0280 aloss:0.8440 eloss:3.3130 aloss2:2.3209 exploreP:0.0507
Episode:563 meanR:60.8100 R:18.0000 rate:0.0360 aloss:0.8389 elo

Episode:633 meanR:27.7900 R:13.0000 rate:0.0260 aloss:0.7968 eloss:3.3529 aloss2:2.2766 exploreP:0.0438
Episode:634 meanR:27.6600 R:17.0000 rate:0.0340 aloss:0.7878 eloss:3.3406 aloss2:2.2409 exploreP:0.0438
Episode:635 meanR:27.5500 R:16.0000 rate:0.0320 aloss:0.7904 eloss:3.3491 aloss2:2.2753 exploreP:0.0437
Episode:636 meanR:27.3300 R:17.0000 rate:0.0340 aloss:0.7882 eloss:3.3366 aloss2:2.2836 exploreP:0.0437
Episode:637 meanR:26.9600 R:13.0000 rate:0.0260 aloss:0.7930 eloss:3.3103 aloss2:2.3056 exploreP:0.0436
Episode:638 meanR:26.8900 R:15.0000 rate:0.0300 aloss:0.7895 eloss:3.3418 aloss2:2.2819 exploreP:0.0436
Episode:639 meanR:26.0500 R:15.0000 rate:0.0300 aloss:0.7614 eloss:3.3364 aloss2:2.2829 exploreP:0.0435
Episode:640 meanR:25.6700 R:16.0000 rate:0.0320 aloss:0.7900 eloss:3.3229 aloss2:2.3015 exploreP:0.0435
Episode:641 meanR:25.4500 R:15.0000 rate:0.0300 aloss:0.7876 eloss:3.3405 aloss2:2.2428 exploreP:0.0434
Episode:642 meanR:25.2000 R:15.0000 rate:0.0300 aloss:0.8223 elo

Episode:712 meanR:22.5000 R:14.0000 rate:0.0280 aloss:0.7561 eloss:3.3091 aloss2:2.2828 exploreP:0.0382
Episode:713 meanR:22.3700 R:16.0000 rate:0.0320 aloss:0.7661 eloss:3.2967 aloss2:2.2881 exploreP:0.0381
Episode:714 meanR:22.2200 R:19.0000 rate:0.0380 aloss:0.7516 eloss:3.3260 aloss2:2.2676 exploreP:0.0381
Episode:715 meanR:22.0400 R:13.0000 rate:0.0260 aloss:0.7628 eloss:3.2973 aloss2:2.2889 exploreP:0.0380
Episode:716 meanR:21.9000 R:21.0000 rate:0.0420 aloss:0.7713 eloss:3.2832 aloss2:2.2906 exploreP:0.0380
Episode:717 meanR:21.8800 R:14.0000 rate:0.0280 aloss:0.7630 eloss:3.2975 aloss2:2.2906 exploreP:0.0379
Episode:718 meanR:21.9100 R:22.0000 rate:0.0440 aloss:0.7454 eloss:3.3048 aloss2:2.2696 exploreP:0.0379
Episode:719 meanR:21.8600 R:14.0000 rate:0.0280 aloss:0.7707 eloss:3.2950 aloss2:2.2911 exploreP:0.0378
Episode:720 meanR:22.0100 R:30.0000 rate:0.0600 aloss:0.7495 eloss:3.3196 aloss2:2.2588 exploreP:0.0377
Episode:721 meanR:22.2100 R:36.0000 rate:0.0720 aloss:0.7392 elo

Episode:791 meanR:35.8400 R:41.0000 rate:0.0820 aloss:0.7348 eloss:3.3026 aloss2:2.2498 exploreP:0.0304
Episode:792 meanR:35.8900 R:41.0000 rate:0.0820 aloss:0.7262 eloss:3.3079 aloss2:2.2662 exploreP:0.0303
Episode:793 meanR:36.1600 R:46.0000 rate:0.0920 aloss:0.7692 eloss:3.3335 aloss2:2.2299 exploreP:0.0302
Episode:794 meanR:36.3700 R:41.0000 rate:0.0820 aloss:0.7517 eloss:3.3235 aloss2:2.2536 exploreP:0.0302
Episode:795 meanR:36.5700 R:40.0000 rate:0.0800 aloss:0.7606 eloss:3.3162 aloss2:2.2556 exploreP:0.0301
Episode:796 meanR:36.8400 R:44.0000 rate:0.0880 aloss:0.6953 eloss:3.3125 aloss2:2.2818 exploreP:0.0300
Episode:797 meanR:37.0900 R:45.0000 rate:0.0900 aloss:0.7273 eloss:3.3045 aloss2:2.2778 exploreP:0.0299
Episode:798 meanR:37.3300 R:40.0000 rate:0.0800 aloss:0.7022 eloss:3.3173 aloss2:2.2661 exploreP:0.0298
Episode:799 meanR:37.8000 R:62.0000 rate:0.1240 aloss:0.7427 eloss:3.3162 aloss2:2.2699 exploreP:0.0297
Episode:800 meanR:38.2900 R:68.0000 rate:0.1360 aloss:0.7354 elo

Episode:870 meanR:42.7700 R:26.0000 rate:0.0520 aloss:0.7308 eloss:3.3038 aloss2:2.3014 exploreP:0.0247
Episode:871 meanR:42.8800 R:50.0000 rate:0.1000 aloss:0.7365 eloss:3.3110 aloss2:2.2925 exploreP:0.0246
Episode:872 meanR:43.0900 R:65.0000 rate:0.1300 aloss:0.7239 eloss:3.3182 aloss2:2.2953 exploreP:0.0245
Episode:873 meanR:43.5000 R:87.0000 rate:0.1740 aloss:0.7152 eloss:3.3279 aloss2:2.2894 exploreP:0.0244
Episode:874 meanR:43.5200 R:43.0000 rate:0.0860 aloss:0.7354 eloss:3.3094 aloss2:2.2955 exploreP:0.0243
Episode:875 meanR:44.0600 R:92.0000 rate:0.1840 aloss:0.7317 eloss:3.3120 aloss2:2.3057 exploreP:0.0242
Episode:876 meanR:44.1000 R:51.0000 rate:0.1020 aloss:0.6705 eloss:3.3109 aloss2:2.3134 exploreP:0.0241
Episode:877 meanR:43.9900 R:46.0000 rate:0.0920 aloss:0.7284 eloss:3.2893 aloss2:2.3266 exploreP:0.0240
Episode:878 meanR:43.9800 R:44.0000 rate:0.0880 aloss:0.7427 eloss:3.2908 aloss2:2.3117 exploreP:0.0240
Episode:879 meanR:44.5500 R:101.0000 rate:0.2020 aloss:0.7113 el

Episode:949 meanR:43.5800 R:48.0000 rate:0.0960 aloss:0.6902 eloss:3.2572 aloss2:2.2950 exploreP:0.0200
Episode:950 meanR:44.0900 R:72.0000 rate:0.1440 aloss:0.6968 eloss:3.2606 aloss2:2.2821 exploreP:0.0200
Episode:951 meanR:44.2300 R:47.0000 rate:0.0940 aloss:0.6651 eloss:3.2457 aloss2:2.2739 exploreP:0.0199
Episode:952 meanR:44.5200 R:47.0000 rate:0.0940 aloss:0.6875 eloss:3.2563 aloss2:2.2631 exploreP:0.0199
Episode:953 meanR:44.7900 R:49.0000 rate:0.0980 aloss:0.7028 eloss:3.2727 aloss2:2.2453 exploreP:0.0198
Episode:954 meanR:45.0700 R:50.0000 rate:0.1000 aloss:0.6599 eloss:3.3076 aloss2:2.2247 exploreP:0.0198
Episode:955 meanR:45.1100 R:37.0000 rate:0.0740 aloss:0.7247 eloss:3.3316 aloss2:2.2036 exploreP:0.0197
Episode:956 meanR:45.2900 R:45.0000 rate:0.0900 aloss:0.6936 eloss:3.3328 aloss2:2.2141 exploreP:0.0197
Episode:957 meanR:45.4600 R:39.0000 rate:0.0780 aloss:0.7261 eloss:3.3227 aloss2:2.1978 exploreP:0.0197
Episode:958 meanR:45.6100 R:41.0000 rate:0.0820 aloss:0.7212 elo

Episode:1028 meanR:53.3600 R:41.0000 rate:0.0820 aloss:0.6996 eloss:3.3606 aloss2:2.2515 exploreP:0.0167
Episode:1029 meanR:53.2100 R:41.0000 rate:0.0820 aloss:0.6904 eloss:3.3521 aloss2:2.2553 exploreP:0.0167
Episode:1030 meanR:49.7700 R:51.0000 rate:0.1020 aloss:0.7092 eloss:3.3479 aloss2:2.2496 exploreP:0.0166
Episode:1031 meanR:49.8000 R:53.0000 rate:0.1060 aloss:0.7006 eloss:3.3754 aloss2:2.2505 exploreP:0.0166
Episode:1032 meanR:54.2400 R:478.0000 rate:0.9560 aloss:0.7007 eloss:3.3138 aloss2:2.2841 exploreP:0.0163
Episode:1033 meanR:55.2500 R:126.0000 rate:0.2520 aloss:0.6757 eloss:3.2886 aloss2:2.2811 exploreP:0.0162
Episode:1034 meanR:57.3700 R:242.0000 rate:0.4840 aloss:0.7131 eloss:3.3045 aloss2:2.2382 exploreP:0.0161
Episode:1035 meanR:57.8900 R:73.0000 rate:0.1460 aloss:0.7357 eloss:3.3135 aloss2:2.2384 exploreP:0.0160
Episode:1036 meanR:58.2100 R:57.0000 rate:0.1140 aloss:0.7221 eloss:3.2988 aloss2:2.2454 exploreP:0.0160
Episode:1037 meanR:58.3300 R:46.0000 rate:0.0920 alo

Episode:1106 meanR:111.6500 R:39.0000 rate:0.0780 aloss:0.6744 eloss:3.2632 aloss2:2.2507 exploreP:0.0124
Episode:1107 meanR:111.7500 R:47.0000 rate:0.0940 aloss:0.6519 eloss:3.2735 aloss2:2.2664 exploreP:0.0124
Episode:1108 meanR:111.3200 R:37.0000 rate:0.0740 aloss:0.6884 eloss:3.2522 aloss2:2.2548 exploreP:0.0124
Episode:1109 meanR:111.4100 R:53.0000 rate:0.1060 aloss:0.6598 eloss:3.2459 aloss2:2.2564 exploreP:0.0124
Episode:1110 meanR:110.8500 R:64.0000 rate:0.1280 aloss:0.6758 eloss:3.2605 aloss2:2.2542 exploreP:0.0123
Episode:1111 meanR:110.7700 R:56.0000 rate:0.1120 aloss:0.6504 eloss:3.2619 aloss2:2.2512 exploreP:0.0123
Episode:1112 meanR:111.6300 R:133.0000 rate:0.2660 aloss:0.6714 eloss:3.2940 aloss2:2.2345 exploreP:0.0123
Episode:1113 meanR:112.3100 R:93.0000 rate:0.1860 aloss:0.6762 eloss:3.2958 aloss2:2.2305 exploreP:0.0123
Episode:1114 meanR:112.7700 R:69.0000 rate:0.1380 aloss:0.6740 eloss:3.2847 aloss2:2.2416 exploreP:0.0123
Episode:1115 meanR:113.4500 R:86.0000 rate:0.

Episode:1183 meanR:164.4300 R:206.0000 rate:0.4120 aloss:0.5794 eloss:3.3041 aloss2:2.2159 exploreP:0.0106
Episode:1184 meanR:165.2900 R:172.0000 rate:0.3440 aloss:0.5495 eloss:3.3018 aloss2:2.2068 exploreP:0.0106
Episode:1185 meanR:165.2700 R:77.0000 rate:0.1540 aloss:0.5909 eloss:3.3140 aloss2:2.1871 exploreP:0.0106
Episode:1186 meanR:165.6700 R:103.0000 rate:0.2060 aloss:0.6017 eloss:3.2965 aloss2:2.1990 exploreP:0.0106
Episode:1187 meanR:163.6800 R:166.0000 rate:0.3320 aloss:0.5660 eloss:3.3023 aloss2:2.2140 exploreP:0.0106
Episode:1188 meanR:164.4700 R:144.0000 rate:0.2880 aloss:0.5784 eloss:3.3088 aloss2:2.2056 exploreP:0.0106
Episode:1189 meanR:163.8900 R:153.0000 rate:0.3060 aloss:0.5925 eloss:3.3032 aloss2:2.2141 exploreP:0.0105
Episode:1190 meanR:163.4200 R:153.0000 rate:0.3060 aloss:0.5855 eloss:3.3154 aloss2:2.2226 exploreP:0.0105
Episode:1191 meanR:164.5700 R:198.0000 rate:0.3960 aloss:0.5801 eloss:3.3058 aloss2:2.2218 exploreP:0.0105
Episode:1192 meanR:165.3300 R:141.0000

Episode:1260 meanR:173.1900 R:186.0000 rate:0.3720 aloss:0.5511 eloss:3.3180 aloss2:2.2000 exploreP:0.0102
Episode:1261 meanR:170.9800 R:177.0000 rate:0.3540 aloss:0.5181 eloss:3.3110 aloss2:2.1942 exploreP:0.0102
Episode:1262 meanR:167.8800 R:190.0000 rate:0.3800 aloss:0.5470 eloss:3.3197 aloss2:2.1857 exploreP:0.0102
Episode:1263 meanR:165.1200 R:180.0000 rate:0.3600 aloss:0.5513 eloss:3.3405 aloss2:2.1939 exploreP:0.0102
Episode:1264 meanR:164.1800 R:163.0000 rate:0.3260 aloss:0.5466 eloss:3.3131 aloss2:2.2025 exploreP:0.0102
Episode:1265 meanR:160.5800 R:140.0000 rate:0.2800 aloss:0.5747 eloss:3.3267 aloss2:2.1828 exploreP:0.0102
Episode:1266 meanR:159.3300 R:163.0000 rate:0.3260 aloss:0.5470 eloss:3.3129 aloss2:2.1965 exploreP:0.0102
Episode:1267 meanR:157.2700 R:172.0000 rate:0.3440 aloss:0.5345 eloss:3.3069 aloss2:2.1977 exploreP:0.0102
Episode:1268 meanR:157.0600 R:228.0000 rate:0.4560 aloss:0.5677 eloss:3.3215 aloss2:2.1852 exploreP:0.0102
Episode:1269 meanR:153.5500 R:149.000

Episode:1337 meanR:166.2300 R:149.0000 rate:0.2980 aloss:0.4838 eloss:3.3202 aloss2:2.1475 exploreP:0.0101
Episode:1338 meanR:167.6200 R:255.0000 rate:0.5100 aloss:0.4861 eloss:3.2991 aloss2:2.1551 exploreP:0.0101
Episode:1339 meanR:167.8900 R:156.0000 rate:0.3120 aloss:0.5117 eloss:3.2839 aloss2:2.1583 exploreP:0.0101
Episode:1340 meanR:168.0900 R:163.0000 rate:0.3260 aloss:0.4716 eloss:3.2999 aloss2:2.1603 exploreP:0.0101
Episode:1341 meanR:168.7300 R:193.0000 rate:0.3860 aloss:0.4895 eloss:3.2882 aloss2:2.1759 exploreP:0.0100
Episode:1342 meanR:169.0300 R:161.0000 rate:0.3220 aloss:0.4629 eloss:3.2978 aloss2:2.1621 exploreP:0.0100
Episode:1343 meanR:169.3700 R:169.0000 rate:0.3380 aloss:0.4733 eloss:3.2872 aloss2:2.1591 exploreP:0.0100
Episode:1344 meanR:168.7900 R:173.0000 rate:0.3460 aloss:0.4573 eloss:3.2765 aloss2:2.1543 exploreP:0.0100
Episode:1345 meanR:168.9900 R:161.0000 rate:0.3220 aloss:0.4824 eloss:3.2898 aloss2:2.1601 exploreP:0.0100
Episode:1346 meanR:168.4500 R:162.000

Episode:1414 meanR:163.4200 R:178.0000 rate:0.3560 aloss:0.4252 eloss:3.2796 aloss2:2.1375 exploreP:0.0100
Episode:1415 meanR:163.5100 R:148.0000 rate:0.2960 aloss:0.4274 eloss:3.2984 aloss2:2.1411 exploreP:0.0100
Episode:1416 meanR:162.7700 R:80.0000 rate:0.1600 aloss:0.4145 eloss:3.3013 aloss2:2.1542 exploreP:0.0100
Episode:1417 meanR:162.1500 R:149.0000 rate:0.2980 aloss:0.4103 eloss:3.2936 aloss2:2.1695 exploreP:0.0100
Episode:1418 meanR:163.1100 R:264.0000 rate:0.5280 aloss:0.4050 eloss:3.2860 aloss2:2.1481 exploreP:0.0100
Episode:1419 meanR:163.0000 R:134.0000 rate:0.2680 aloss:0.4145 eloss:3.2872 aloss2:2.1513 exploreP:0.0100
Episode:1420 meanR:162.8500 R:196.0000 rate:0.3920 aloss:0.4123 eloss:3.2905 aloss2:2.1566 exploreP:0.0100
Episode:1421 meanR:162.1000 R:59.0000 rate:0.1180 aloss:0.4258 eloss:3.2930 aloss2:2.1551 exploreP:0.0100
Episode:1422 meanR:162.0600 R:146.0000 rate:0.2920 aloss:0.4176 eloss:3.3018 aloss2:2.1667 exploreP:0.0100
Episode:1423 meanR:162.1100 R:152.0000 

Episode:1491 meanR:132.5100 R:57.0000 rate:0.1140 aloss:0.3692 eloss:3.2906 aloss2:2.1523 exploreP:0.0100
Episode:1492 meanR:132.1300 R:68.0000 rate:0.1360 aloss:0.3535 eloss:3.2970 aloss2:2.1636 exploreP:0.0100
Episode:1493 meanR:131.2500 R:69.0000 rate:0.1380 aloss:0.3761 eloss:3.2762 aloss2:2.1972 exploreP:0.0100
Episode:1494 meanR:129.8700 R:53.0000 rate:0.1060 aloss:0.3434 eloss:3.2895 aloss2:2.1441 exploreP:0.0100
Episode:1495 meanR:128.9400 R:65.0000 rate:0.1300 aloss:0.3496 eloss:3.2931 aloss2:2.1504 exploreP:0.0100
Episode:1496 meanR:127.8700 R:65.0000 rate:0.1300 aloss:0.3705 eloss:3.2884 aloss2:2.1674 exploreP:0.0100
Episode:1497 meanR:127.0500 R:53.0000 rate:0.1060 aloss:0.3755 eloss:3.2874 aloss2:2.1409 exploreP:0.0100
Episode:1498 meanR:126.6700 R:141.0000 rate:0.2820 aloss:0.3709 eloss:3.2943 aloss2:2.1550 exploreP:0.0100
Episode:1499 meanR:125.7800 R:75.0000 rate:0.1500 aloss:0.3466 eloss:3.2913 aloss2:2.1506 exploreP:0.0100
Episode:1500 meanR:124.7400 R:59.0000 rate:0.

Episode:1568 meanR:111.2000 R:166.0000 rate:0.3320 aloss:0.3706 eloss:3.3270 aloss2:2.1622 exploreP:0.0100
Episode:1569 meanR:112.4600 R:192.0000 rate:0.3840 aloss:0.3775 eloss:3.3322 aloss2:2.1715 exploreP:0.0100
Episode:1570 meanR:113.2400 R:149.0000 rate:0.2980 aloss:0.3764 eloss:3.3245 aloss2:2.1667 exploreP:0.0100
Episode:1571 meanR:113.3700 R:66.0000 rate:0.1320 aloss:0.3710 eloss:3.3228 aloss2:2.1584 exploreP:0.0100
Episode:1572 meanR:113.7100 R:197.0000 rate:0.3940 aloss:0.3829 eloss:3.3278 aloss2:2.1670 exploreP:0.0100
Episode:1573 meanR:114.4500 R:152.0000 rate:0.3040 aloss:0.3720 eloss:3.3271 aloss2:2.1671 exploreP:0.0100
Episode:1574 meanR:115.5200 R:178.0000 rate:0.3560 aloss:0.3755 eloss:3.3216 aloss2:2.1586 exploreP:0.0100
Episode:1575 meanR:116.6400 R:174.0000 rate:0.3480 aloss:0.3712 eloss:3.3275 aloss2:2.1558 exploreP:0.0100
Episode:1576 meanR:117.3500 R:137.0000 rate:0.2740 aloss:0.3665 eloss:3.3244 aloss2:2.1554 exploreP:0.0100
Episode:1577 meanR:116.7500 R:152.0000

Episode:1645 meanR:148.2600 R:148.0000 rate:0.2960 aloss:0.3604 eloss:3.3236 aloss2:2.1316 exploreP:0.0100
Episode:1646 meanR:148.5100 R:159.0000 rate:0.3180 aloss:0.3709 eloss:3.3353 aloss2:2.1288 exploreP:0.0100
Episode:1647 meanR:149.5800 R:168.0000 rate:0.3360 aloss:0.3589 eloss:3.3254 aloss2:2.1384 exploreP:0.0100
Episode:1648 meanR:150.0700 R:200.0000 rate:0.4000 aloss:0.3783 eloss:3.3212 aloss2:2.1529 exploreP:0.0100
Episode:1649 meanR:150.7200 R:231.0000 rate:0.4620 aloss:0.3799 eloss:3.3137 aloss2:2.1469 exploreP:0.0100
Episode:1650 meanR:149.9800 R:137.0000 rate:0.2740 aloss:0.3605 eloss:3.3066 aloss2:2.1268 exploreP:0.0100
Episode:1651 meanR:149.4900 R:167.0000 rate:0.3340 aloss:0.3701 eloss:3.3154 aloss2:2.1291 exploreP:0.0100
Episode:1652 meanR:149.4200 R:179.0000 rate:0.3580 aloss:0.3520 eloss:3.3211 aloss2:2.1253 exploreP:0.0100
Episode:1653 meanR:149.4900 R:159.0000 rate:0.3180 aloss:0.3673 eloss:3.3101 aloss2:2.1282 exploreP:0.0100
Episode:1654 meanR:150.8600 R:205.000

Episode:1722 meanR:177.8600 R:91.0000 rate:0.1820 aloss:0.3825 eloss:3.2784 aloss2:2.1309 exploreP:0.0100
Episode:1723 meanR:177.6200 R:147.0000 rate:0.2940 aloss:0.3881 eloss:3.2855 aloss2:2.1231 exploreP:0.0100
Episode:1724 meanR:177.1200 R:158.0000 rate:0.3160 aloss:0.3897 eloss:3.2869 aloss2:2.1445 exploreP:0.0100
Episode:1725 meanR:176.6100 R:126.0000 rate:0.2520 aloss:0.3823 eloss:3.2673 aloss2:2.1490 exploreP:0.0100
Episode:1726 meanR:177.2200 R:126.0000 rate:0.2520 aloss:0.4010 eloss:3.2909 aloss2:2.1479 exploreP:0.0100
Episode:1727 meanR:175.7100 R:106.0000 rate:0.2120 aloss:0.3838 eloss:3.2759 aloss2:2.1477 exploreP:0.0100
Episode:1728 meanR:174.8100 R:92.0000 rate:0.1840 aloss:0.3790 eloss:3.2511 aloss2:2.1516 exploreP:0.0100
Episode:1729 meanR:174.4600 R:123.0000 rate:0.2460 aloss:0.3908 eloss:3.2662 aloss2:2.1581 exploreP:0.0100
Episode:1730 meanR:173.8900 R:137.0000 rate:0.2740 aloss:0.3869 eloss:3.2707 aloss2:2.1507 exploreP:0.0100
Episode:1731 meanR:173.9500 R:167.0000 

Episode:1799 meanR:124.4800 R:70.0000 rate:0.1400 aloss:0.3859 eloss:3.2565 aloss2:2.2174 exploreP:0.0100
Episode:1800 meanR:123.6000 R:120.0000 rate:0.2400 aloss:0.3941 eloss:3.2587 aloss2:2.1809 exploreP:0.0100
Episode:1801 meanR:123.3700 R:160.0000 rate:0.3200 aloss:0.3836 eloss:3.2027 aloss2:2.1832 exploreP:0.0100
Episode:1802 meanR:123.5700 R:150.0000 rate:0.3000 aloss:0.3869 eloss:3.2616 aloss2:2.1544 exploreP:0.0100
Episode:1803 meanR:123.0000 R:101.0000 rate:0.2020 aloss:0.3923 eloss:3.2555 aloss2:2.2145 exploreP:0.0100
Episode:1804 meanR:122.5600 R:65.0000 rate:0.1300 aloss:0.3848 eloss:3.2364 aloss2:2.2322 exploreP:0.0100
Episode:1805 meanR:123.3100 R:157.0000 rate:0.3140 aloss:0.3841 eloss:3.2456 aloss2:2.2031 exploreP:0.0100
Episode:1806 meanR:123.3700 R:188.0000 rate:0.3760 aloss:0.3813 eloss:3.1917 aloss2:2.1889 exploreP:0.0100
Episode:1807 meanR:122.2300 R:120.0000 rate:0.2400 aloss:0.3819 eloss:3.2464 aloss2:2.1569 exploreP:0.0100
Episode:1808 meanR:121.7400 R:70.0000 r

Episode:1876 meanR:125.2400 R:131.0000 rate:0.2620 aloss:0.3320 eloss:3.1773 aloss2:2.2081 exploreP:0.0100
Episode:1877 meanR:125.4800 R:105.0000 rate:0.2100 aloss:0.3526 eloss:3.1770 aloss2:2.2995 exploreP:0.0100
Episode:1878 meanR:125.6600 R:137.0000 rate:0.2740 aloss:0.3509 eloss:3.1125 aloss2:2.3743 exploreP:0.0100
Episode:1879 meanR:126.0200 R:148.0000 rate:0.2960 aloss:0.3464 eloss:3.1082 aloss2:2.3719 exploreP:0.0100
Episode:1880 meanR:126.3300 R:96.0000 rate:0.1920 aloss:0.3411 eloss:3.1560 aloss2:2.3479 exploreP:0.0100
Episode:1881 meanR:126.7900 R:121.0000 rate:0.2420 aloss:0.3460 eloss:3.1696 aloss2:2.3186 exploreP:0.0100
Episode:1882 meanR:127.3600 R:124.0000 rate:0.2480 aloss:0.3475 eloss:3.1146 aloss2:2.3314 exploreP:0.0100
Episode:1883 meanR:127.6000 R:166.0000 rate:0.3320 aloss:0.3372 eloss:3.0178 aloss2:2.3588 exploreP:0.0100
Episode:1884 meanR:127.2100 R:110.0000 rate:0.2200 aloss:0.3188 eloss:2.9489 aloss2:2.3298 exploreP:0.0100
Episode:1885 meanR:127.5500 R:129.0000

Episode:1953 meanR:138.9700 R:131.0000 rate:0.2620 aloss:0.3216 eloss:2.8680 aloss2:2.4173 exploreP:0.0100
Episode:1954 meanR:139.0800 R:136.0000 rate:0.2720 aloss:0.3211 eloss:2.9056 aloss2:2.3471 exploreP:0.0100
Episode:1955 meanR:139.7600 R:204.0000 rate:0.4080 aloss:0.3289 eloss:2.9631 aloss2:2.3154 exploreP:0.0100
Episode:1956 meanR:140.5600 R:152.0000 rate:0.3040 aloss:0.3405 eloss:3.1281 aloss2:2.2289 exploreP:0.0100
Episode:1957 meanR:140.4800 R:123.0000 rate:0.2460 aloss:0.3554 eloss:3.2034 aloss2:2.2599 exploreP:0.0100
Episode:1958 meanR:140.3000 R:112.0000 rate:0.2240 aloss:0.3595 eloss:3.1592 aloss2:2.3548 exploreP:0.0100
Episode:1959 meanR:139.8200 R:75.0000 rate:0.1500 aloss:0.3585 eloss:3.1141 aloss2:2.4127 exploreP:0.0100
Episode:1960 meanR:139.2900 R:138.0000 rate:0.2760 aloss:0.3560 eloss:3.0455 aloss2:2.4604 exploreP:0.0100
Episode:1961 meanR:139.0000 R:107.0000 rate:0.2140 aloss:0.3515 eloss:3.0393 aloss2:2.4707 exploreP:0.0100
Episode:1962 meanR:139.0800 R:176.0000

Episode:2030 meanR:146.9400 R:172.0000 rate:0.3440 aloss:0.3654 eloss:3.0094 aloss2:2.4659 exploreP:0.0100
Episode:2031 meanR:146.7600 R:123.0000 rate:0.2460 aloss:0.3669 eloss:3.0271 aloss2:2.4683 exploreP:0.0100
Episode:2032 meanR:146.5800 R:133.0000 rate:0.2660 aloss:0.3648 eloss:3.0126 aloss2:2.4613 exploreP:0.0100
Episode:2033 meanR:146.0300 R:146.0000 rate:0.2920 aloss:0.3625 eloss:3.0267 aloss2:2.4466 exploreP:0.0100
Episode:2034 meanR:145.7900 R:126.0000 rate:0.2520 aloss:0.3656 eloss:3.0056 aloss2:2.4684 exploreP:0.0100
Episode:2035 meanR:145.8700 R:173.0000 rate:0.3460 aloss:0.3589 eloss:3.0124 aloss2:2.4650 exploreP:0.0100
Episode:2036 meanR:145.8000 R:116.0000 rate:0.2320 aloss:0.3577 eloss:3.0151 aloss2:2.4422 exploreP:0.0100
Episode:2037 meanR:144.7900 R:44.0000 rate:0.0880 aloss:0.3600 eloss:3.0104 aloss2:2.4349 exploreP:0.0100
Episode:2038 meanR:143.7300 R:28.0000 rate:0.0560 aloss:0.3578 eloss:3.0125 aloss2:2.4906 exploreP:0.0100
Episode:2039 meanR:142.7500 R:163.0000 

Episode:2107 meanR:140.8600 R:137.0000 rate:0.2740 aloss:0.3580 eloss:2.9169 aloss2:2.4517 exploreP:0.0100
Episode:2108 meanR:140.6200 R:137.0000 rate:0.2740 aloss:0.3638 eloss:2.9351 aloss2:2.4480 exploreP:0.0100
Episode:2109 meanR:139.5200 R:21.0000 rate:0.0420 aloss:0.3505 eloss:2.9629 aloss2:2.4540 exploreP:0.0100
Episode:2110 meanR:139.6600 R:170.0000 rate:0.3400 aloss:0.3624 eloss:2.9319 aloss2:2.4681 exploreP:0.0100
Episode:2111 meanR:139.3600 R:123.0000 rate:0.2460 aloss:0.3656 eloss:2.9178 aloss2:2.4394 exploreP:0.0100
Episode:2112 meanR:139.5100 R:168.0000 rate:0.3360 aloss:0.3656 eloss:2.9317 aloss2:2.4562 exploreP:0.0100
Episode:2113 meanR:139.5100 R:163.0000 rate:0.3260 aloss:0.3638 eloss:2.9138 aloss2:2.4466 exploreP:0.0100
Episode:2114 meanR:138.4300 R:114.0000 rate:0.2280 aloss:0.3611 eloss:2.9068 aloss2:2.4216 exploreP:0.0100
Episode:2115 meanR:138.4300 R:145.0000 rate:0.2900 aloss:0.3606 eloss:2.9123 aloss2:2.4541 exploreP:0.0100
Episode:2116 meanR:138.2700 R:131.0000

Episode:2184 meanR:129.6500 R:103.0000 rate:0.2060 aloss:0.3556 eloss:2.3936 aloss2:2.9825 exploreP:0.0100
Episode:2185 meanR:128.7600 R:97.0000 rate:0.1940 aloss:0.3554 eloss:2.3420 aloss2:3.0083 exploreP:0.0100
Episode:2186 meanR:128.7100 R:104.0000 rate:0.2080 aloss:0.3596 eloss:2.3445 aloss2:3.0249 exploreP:0.0100
Episode:2187 meanR:128.0500 R:109.0000 rate:0.2180 aloss:0.3591 eloss:2.2955 aloss2:3.0798 exploreP:0.0100
Episode:2188 meanR:127.4900 R:106.0000 rate:0.2120 aloss:0.3595 eloss:2.3079 aloss2:3.0988 exploreP:0.0100
Episode:2189 meanR:126.9600 R:131.0000 rate:0.2620 aloss:0.3599 eloss:2.2140 aloss2:3.1729 exploreP:0.0100
Episode:2190 meanR:125.8800 R:38.0000 rate:0.0760 aloss:0.3690 eloss:2.1479 aloss2:3.2343 exploreP:0.0100
Episode:2191 meanR:126.1300 R:135.0000 rate:0.2700 aloss:0.3516 eloss:2.1660 aloss2:3.2541 exploreP:0.0100
Episode:2192 meanR:125.6500 R:105.0000 rate:0.2100 aloss:0.3565 eloss:2.1822 aloss2:3.2556 exploreP:0.0100
Episode:2193 meanR:125.0400 R:75.0000 r

Episode:2262 meanR:98.1800 R:112.0000 rate:0.2240 aloss:0.3358 eloss:1.4914 aloss2:4.3188 exploreP:0.0100
Episode:2263 meanR:97.9300 R:103.0000 rate:0.2060 aloss:0.3303 eloss:1.4201 aloss2:4.3616 exploreP:0.0100
Episode:2264 meanR:97.9600 R:106.0000 rate:0.2120 aloss:0.3217 eloss:1.5765 aloss2:4.1844 exploreP:0.0100
Episode:2265 meanR:98.1500 R:145.0000 rate:0.2900 aloss:0.3365 eloss:1.5064 aloss2:4.3569 exploreP:0.0100
Episode:2266 meanR:97.5900 R:103.0000 rate:0.2060 aloss:0.3300 eloss:1.3669 aloss2:4.4936 exploreP:0.0100
Episode:2267 meanR:97.5100 R:106.0000 rate:0.2120 aloss:0.3409 eloss:1.2498 aloss2:4.5935 exploreP:0.0100
Episode:2268 meanR:97.6700 R:143.0000 rate:0.2860 aloss:0.3263 eloss:1.3962 aloss2:4.4687 exploreP:0.0100
Episode:2269 meanR:99.0000 R:149.0000 rate:0.2980 aloss:0.3289 eloss:1.3840 aloss2:4.4049 exploreP:0.0100
Episode:2270 meanR:99.8800 R:110.0000 rate:0.2200 aloss:0.3277 eloss:1.3960 aloss2:4.4369 exploreP:0.0100
Episode:2271 meanR:98.7300 R:37.0000 rate:0.07

Episode:2339 meanR:111.3800 R:107.0000 rate:0.2140 aloss:0.3073 eloss:2.3132 aloss2:3.5074 exploreP:0.0100
Episode:2340 meanR:113.2900 R:248.0000 rate:0.4960 aloss:0.3036 eloss:2.3316 aloss2:3.4137 exploreP:0.0100
Episode:2341 meanR:113.1000 R:105.0000 rate:0.2100 aloss:0.3063 eloss:2.4287 aloss2:3.2683 exploreP:0.0100
Episode:2342 meanR:113.8700 R:137.0000 rate:0.2740 aloss:0.3086 eloss:2.4023 aloss2:3.3179 exploreP:0.0100
Episode:2343 meanR:114.7100 R:122.0000 rate:0.2440 aloss:0.3057 eloss:2.6832 aloss2:3.0510 exploreP:0.0100
Episode:2344 meanR:115.2700 R:119.0000 rate:0.2380 aloss:0.3181 eloss:2.6913 aloss2:3.0448 exploreP:0.0100
Episode:2345 meanR:116.2300 R:204.0000 rate:0.4080 aloss:0.3209 eloss:2.7425 aloss2:2.9626 exploreP:0.0100
Episode:2346 meanR:116.1800 R:117.0000 rate:0.2340 aloss:0.3184 eloss:2.7548 aloss2:2.9825 exploreP:0.0100
Episode:2347 meanR:116.0600 R:104.0000 rate:0.2080 aloss:0.3124 eloss:2.8015 aloss2:2.8778 exploreP:0.0100
Episode:2348 meanR:116.4200 R:154.000

Episode:2416 meanR:134.9100 R:175.0000 rate:0.3500 aloss:0.2867 eloss:3.0080 aloss2:2.9606 exploreP:0.0100
Episode:2417 meanR:135.0300 R:117.0000 rate:0.2340 aloss:0.2968 eloss:2.9941 aloss2:2.9537 exploreP:0.0100
Episode:2418 meanR:135.1800 R:113.0000 rate:0.2260 aloss:0.2838 eloss:3.0239 aloss2:2.9282 exploreP:0.0100
Episode:2419 meanR:134.5200 R:131.0000 rate:0.2620 aloss:0.2844 eloss:3.0267 aloss2:2.9223 exploreP:0.0100
Episode:2420 meanR:135.6500 R:161.0000 rate:0.3220 aloss:0.2849 eloss:3.0124 aloss2:2.9488 exploreP:0.0100
Episode:2421 meanR:135.5300 R:112.0000 rate:0.2240 aloss:0.2836 eloss:3.0321 aloss2:2.8980 exploreP:0.0100
Episode:2422 meanR:135.3200 R:111.0000 rate:0.2220 aloss:0.3032 eloss:3.0692 aloss2:2.9030 exploreP:0.0100
Episode:2423 meanR:135.0700 R:148.0000 rate:0.2960 aloss:0.2815 eloss:3.0180 aloss2:2.8947 exploreP:0.0100
Episode:2424 meanR:135.0300 R:144.0000 rate:0.2880 aloss:0.2933 eloss:3.0144 aloss2:2.9247 exploreP:0.0100
Episode:2425 meanR:135.0900 R:116.000

Episode:2493 meanR:135.6500 R:126.0000 rate:0.2520 aloss:0.2589 eloss:3.0250 aloss2:2.8595 exploreP:0.0100
Episode:2494 meanR:135.6100 R:127.0000 rate:0.2540 aloss:0.2627 eloss:3.0401 aloss2:2.8502 exploreP:0.0100
Episode:2495 meanR:135.4400 R:133.0000 rate:0.2660 aloss:0.2674 eloss:3.0453 aloss2:2.8646 exploreP:0.0100
Episode:2496 meanR:135.5800 R:176.0000 rate:0.3520 aloss:0.2575 eloss:3.0208 aloss2:2.8958 exploreP:0.0100
Episode:2497 meanR:136.5400 R:212.0000 rate:0.4240 aloss:0.2607 eloss:2.9961 aloss2:2.9143 exploreP:0.0100
Episode:2498 meanR:136.4500 R:113.0000 rate:0.2260 aloss:0.2721 eloss:3.0491 aloss2:2.8091 exploreP:0.0100
Episode:2499 meanR:136.7000 R:141.0000 rate:0.2820 aloss:0.2466 eloss:2.9889 aloss2:2.8916 exploreP:0.0100
Episode:2500 meanR:136.6800 R:126.0000 rate:0.2520 aloss:0.2568 eloss:3.0317 aloss2:2.7766 exploreP:0.0100
Episode:2501 meanR:135.3400 R:127.0000 rate:0.2540 aloss:0.2621 eloss:3.0007 aloss2:2.8256 exploreP:0.0100
Episode:2502 meanR:135.4700 R:138.000

Episode:2570 meanR:138.6100 R:185.0000 rate:0.3700 aloss:0.2142 eloss:2.9254 aloss2:2.7636 exploreP:0.0100
Episode:2571 meanR:138.4100 R:113.0000 rate:0.2260 aloss:0.2184 eloss:2.9119 aloss2:2.7100 exploreP:0.0100
Episode:2572 meanR:138.4000 R:118.0000 rate:0.2360 aloss:0.2103 eloss:2.9196 aloss2:2.7040 exploreP:0.0100
Episode:2573 meanR:138.1000 R:116.0000 rate:0.2320 aloss:0.2258 eloss:2.8659 aloss2:2.7529 exploreP:0.0100
Episode:2574 meanR:138.2700 R:136.0000 rate:0.2720 aloss:0.2146 eloss:2.9385 aloss2:2.6784 exploreP:0.0100
Episode:2575 meanR:138.6800 R:151.0000 rate:0.3020 aloss:0.2068 eloss:2.9340 aloss2:2.6647 exploreP:0.0100
Episode:2576 meanR:138.7800 R:123.0000 rate:0.2460 aloss:0.2150 eloss:2.9290 aloss2:2.6595 exploreP:0.0100
Episode:2577 meanR:139.0500 R:148.0000 rate:0.2960 aloss:0.2171 eloss:2.9181 aloss2:2.6572 exploreP:0.0100
Episode:2578 meanR:139.5100 R:182.0000 rate:0.3640 aloss:0.2132 eloss:2.9337 aloss2:2.6643 exploreP:0.0100
Episode:2579 meanR:139.2800 R:116.000

Episode:2647 meanR:131.3000 R:135.0000 rate:0.2700 aloss:0.1931 eloss:3.0064 aloss2:2.7676 exploreP:0.0100
Episode:2648 meanR:131.6100 R:178.0000 rate:0.3560 aloss:0.1915 eloss:2.9972 aloss2:2.7746 exploreP:0.0100
Episode:2649 meanR:130.9300 R:109.0000 rate:0.2180 aloss:0.1829 eloss:2.9763 aloss2:2.7905 exploreP:0.0100
Episode:2650 meanR:130.6000 R:106.0000 rate:0.2120 aloss:0.1913 eloss:2.9743 aloss2:2.8022 exploreP:0.0100
Episode:2651 meanR:131.1400 R:192.0000 rate:0.3840 aloss:0.1886 eloss:2.9692 aloss2:2.7912 exploreP:0.0100
Episode:2652 meanR:131.2900 R:114.0000 rate:0.2280 aloss:0.1967 eloss:2.9881 aloss2:2.7916 exploreP:0.0100
Episode:2653 meanR:131.3900 R:122.0000 rate:0.2440 aloss:0.2100 eloss:3.0211 aloss2:2.7928 exploreP:0.0100
Episode:2654 meanR:131.4400 R:132.0000 rate:0.2640 aloss:0.2397 eloss:3.0585 aloss2:2.7548 exploreP:0.0100
Episode:2655 meanR:131.8700 R:150.0000 rate:0.3000 aloss:0.3074 eloss:3.0809 aloss2:2.7078 exploreP:0.0100
Episode:2656 meanR:131.6600 R:118.000

Episode:2724 meanR:125.5000 R:152.0000 rate:0.3040 aloss:0.2345 eloss:2.9352 aloss2:2.7517 exploreP:0.0100
Episode:2725 meanR:125.5000 R:110.0000 rate:0.2200 aloss:0.2191 eloss:2.9345 aloss2:2.7806 exploreP:0.0100
Episode:2726 meanR:124.5500 R:115.0000 rate:0.2300 aloss:0.2257 eloss:2.9047 aloss2:2.7829 exploreP:0.0100
Episode:2727 meanR:124.3900 R:143.0000 rate:0.2860 aloss:0.2335 eloss:2.9342 aloss2:2.7858 exploreP:0.0100
Episode:2728 meanR:124.5000 R:153.0000 rate:0.3060 aloss:0.2397 eloss:2.9572 aloss2:2.7565 exploreP:0.0100
Episode:2729 meanR:124.6300 R:142.0000 rate:0.2840 aloss:0.2303 eloss:2.9683 aloss2:2.7375 exploreP:0.0100
Episode:2730 meanR:124.7900 R:135.0000 rate:0.2700 aloss:0.2519 eloss:2.9652 aloss2:2.7755 exploreP:0.0100
Episode:2731 meanR:124.7800 R:142.0000 rate:0.2840 aloss:0.2586 eloss:2.9625 aloss2:2.7436 exploreP:0.0100
Episode:2732 meanR:125.0400 R:155.0000 rate:0.3100 aloss:0.2380 eloss:2.9471 aloss2:2.7419 exploreP:0.0100
Episode:2733 meanR:125.0600 R:100.000

Episode:2801 meanR:126.7200 R:108.0000 rate:0.2160 aloss:0.3871 eloss:2.9464 aloss2:3.1042 exploreP:0.0100
Episode:2802 meanR:126.6700 R:103.0000 rate:0.2060 aloss:0.3325 eloss:2.9781 aloss2:2.7765 exploreP:0.0100
Episode:2803 meanR:126.7300 R:115.0000 rate:0.2300 aloss:0.3558 eloss:3.0231 aloss2:2.7825 exploreP:0.0100
Episode:2804 meanR:127.3900 R:165.0000 rate:0.3300 aloss:0.3246 eloss:3.3106 aloss2:2.7441 exploreP:0.0100
Episode:2805 meanR:127.5900 R:130.0000 rate:0.2600 aloss:0.2993 eloss:3.8593 aloss2:2.9086 exploreP:0.0100
Episode:2806 meanR:127.3500 R:105.0000 rate:0.2100 aloss:0.3122 eloss:3.7508 aloss2:3.0954 exploreP:0.0100
Episode:2807 meanR:126.6600 R:56.0000 rate:0.1120 aloss:0.3399 eloss:3.5268 aloss2:2.9585 exploreP:0.0100
Episode:2808 meanR:126.7500 R:108.0000 rate:0.2160 aloss:0.3953 eloss:3.4411 aloss2:3.0377 exploreP:0.0100
Episode:2809 meanR:126.1200 R:57.0000 rate:0.1140 aloss:0.3902 eloss:3.3365 aloss2:3.0444 exploreP:0.0100
Episode:2810 meanR:125.6600 R:64.0000 r

Episode:2879 meanR:73.0100 R:45.0000 rate:0.0900 aloss:0.3700 eloss:2.0511 aloss2:4.5292 exploreP:0.0100
Episode:2880 meanR:72.1700 R:31.0000 rate:0.0620 aloss:0.3994 eloss:1.9303 aloss2:4.8736 exploreP:0.0100
Episode:2881 meanR:71.1500 R:29.0000 rate:0.0580 aloss:0.4860 eloss:2.1152 aloss2:4.7090 exploreP:0.0100
Episode:2882 meanR:70.8600 R:47.0000 rate:0.0940 aloss:0.3743 eloss:1.9430 aloss2:4.7411 exploreP:0.0100
Episode:2883 meanR:70.4200 R:43.0000 rate:0.0860 aloss:0.3977 eloss:2.0858 aloss2:4.6545 exploreP:0.0100
Episode:2884 meanR:69.8700 R:35.0000 rate:0.0700 aloss:0.3564 eloss:1.3608 aloss2:5.6772 exploreP:0.0100
Episode:2885 meanR:68.9700 R:29.0000 rate:0.0580 aloss:0.4357 eloss:1.4712 aloss2:5.4552 exploreP:0.0100
Episode:2886 meanR:67.8500 R:32.0000 rate:0.0640 aloss:0.4247 eloss:2.1096 aloss2:4.5710 exploreP:0.0100
Episode:2887 meanR:66.4400 R:33.0000 rate:0.0660 aloss:0.3937 eloss:1.4393 aloss2:5.3334 exploreP:0.0100
Episode:2888 meanR:65.6700 R:37.0000 rate:0.0740 aloss:

Episode:2958 meanR:44.4000 R:50.0000 rate:0.1000 aloss:0.4505 eloss:0.7872 aloss2:5.1795 exploreP:0.0100
Episode:2959 meanR:44.4600 R:37.0000 rate:0.0740 aloss:0.4018 eloss:0.8573 aloss2:5.1315 exploreP:0.0100
Episode:2960 meanR:43.9400 R:50.0000 rate:0.1000 aloss:0.4233 eloss:1.1020 aloss2:4.8030 exploreP:0.0100
Episode:2961 meanR:43.8300 R:31.0000 rate:0.0620 aloss:0.3861 eloss:1.3509 aloss2:4.5846 exploreP:0.0100
Episode:2962 meanR:43.1400 R:35.0000 rate:0.0700 aloss:0.3929 eloss:0.8569 aloss2:5.0422 exploreP:0.0100
Episode:2963 meanR:43.1000 R:35.0000 rate:0.0700 aloss:0.4023 eloss:0.5947 aloss2:5.3389 exploreP:0.0100
Episode:2964 meanR:42.3900 R:43.0000 rate:0.0860 aloss:0.4425 eloss:0.8431 aloss2:5.0947 exploreP:0.0100
Episode:2965 meanR:42.2900 R:33.0000 rate:0.0660 aloss:0.3881 eloss:0.7243 aloss2:5.2126 exploreP:0.0100
Episode:2966 meanR:41.4800 R:37.0000 rate:0.0740 aloss:0.3693 eloss:1.1767 aloss2:4.7200 exploreP:0.0100
Episode:2967 meanR:41.5700 R:40.0000 rate:0.0800 aloss:

Episode:3037 meanR:43.4000 R:39.0000 rate:0.0780 aloss:0.3126 eloss:2.3584 aloss2:3.3697 exploreP:0.0100
Episode:3038 meanR:43.3200 R:27.0000 rate:0.0540 aloss:0.3100 eloss:2.2228 aloss2:3.5304 exploreP:0.0100
Episode:3039 meanR:44.1800 R:123.0000 rate:0.2460 aloss:0.3253 eloss:2.3248 aloss2:3.4127 exploreP:0.0100
Episode:3040 meanR:44.9600 R:113.0000 rate:0.2260 aloss:0.2833 eloss:2.3648 aloss2:3.3818 exploreP:0.0100
Episode:3041 meanR:45.6600 R:107.0000 rate:0.2140 aloss:0.2849 eloss:2.5500 aloss2:3.2106 exploreP:0.0100
Episode:3042 meanR:46.5400 R:131.0000 rate:0.2620 aloss:0.2721 eloss:2.6524 aloss2:3.0865 exploreP:0.0100
Episode:3043 meanR:47.3400 R:125.0000 rate:0.2500 aloss:0.2875 eloss:2.6678 aloss2:3.0994 exploreP:0.0100
Episode:3044 meanR:48.0500 R:106.0000 rate:0.2120 aloss:0.2561 eloss:2.7923 aloss2:2.9723 exploreP:0.0100
Episode:3045 meanR:48.6200 R:102.0000 rate:0.2040 aloss:0.2596 eloss:2.9121 aloss2:2.8349 exploreP:0.0100
Episode:3046 meanR:49.5100 R:130.0000 rate:0.260

# Visualizing training

Below I'll plot the total rewards for each episode. The raw per-episode values are drawn in grey, and the rolling average (over a 10-episode window) is overlaid in blue.

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

def running_mean(x, N):
    """Return the simple moving average of ``x`` over a window of ``N`` samples.

    The average is computed with a cumulative sum: after prepending a zero,
    the sum of any ``N`` consecutive samples is the difference of two
    cumulative-sum entries that are ``N`` positions apart.

    Args:
        x: 1-D sequence or array of numbers.
        N: Window length; must be a positive integer.

    Returns:
        Array with ``max(len(x) - N + 1, 0)`` entries, one per full window.
        It is empty when ``x`` holds fewer than ``N`` samples.

    Raises:
        ValueError: If ``N`` is smaller than 1.
    """
    # A zero window divides by zero and a negative one makes the two slices
    # below pair up unrelated entries, silently producing meaningless values.
    if N < 1:
        raise ValueError('N must be a positive integer, got {}'.format(N))
    cumsum = np.cumsum(np.insert(x, 0, 0))
    return (cumsum[N:] - cumsum[:-N]) / N

In [None]:
# Transpose the logged series and unpack its two rows; presumably these are
# (episode index, total reward) pairs -- TODO confirm against the training loop.
eps, arr = np.array(episode_rewards_list).T
smoothed_arr = running_mean(arr, 10)
# The smoothed series is shorter than the raw one, so it is drawn against the
# last len(smoothed_arr) episodes. The start index is computed explicitly
# because eps[-len(smoothed_arr):] turns into eps[-0:] (the whole axis) when
# the smoothed series is empty, and plt.plot then fails on mismatched lengths.
plt.plot(eps[len(eps) - len(smoothed_arr):], smoothed_arr)
plt.plot(eps, arr, color='grey', alpha=0.3)
plt.xlabel('Episode')
plt.ylabel('Total rewards');  # trailing ';' keeps the returned Text repr out of the cell output

In [None]:
# Transpose the logged series and unpack its two rows; presumably these are
# (episode index, value) pairs -- TODO confirm what rewards_list records and
# whether the y-axis label below still describes it.
eps, arr = np.array(rewards_list).T
smoothed_arr = running_mean(arr, 10)
# The smoothed series is shorter than the raw one, so it is drawn against the
# last len(smoothed_arr) episodes. The start index is computed explicitly
# because eps[-len(smoothed_arr):] turns into eps[-0:] (the whole axis) when
# the smoothed series is empty, and plt.plot then fails on mismatched lengths.
plt.plot(eps[len(eps) - len(smoothed_arr):], smoothed_arr)
plt.plot(eps, arr, color='grey', alpha=0.3)
plt.xlabel('Episode')
plt.ylabel('Total rewards');  # trailing ';' keeps the returned Text repr out of the cell output

In [None]:
# Transpose the logged series and unpack its two rows; presumably these are
# (episode index, Act loss) pairs -- TODO confirm against the training loop.
eps, arr = np.array(aloss_list).T
smoothed_arr = running_mean(arr, 10)
# The smoothed series is shorter than the raw one, so it is drawn against the
# last len(smoothed_arr) episodes. The start index is computed explicitly
# because eps[-len(smoothed_arr):] turns into eps[-0:] (the whole axis) when
# the smoothed series is empty, and plt.plot then fails on mismatched lengths.
plt.plot(eps[len(eps) - len(smoothed_arr):], smoothed_arr)
plt.plot(eps, arr, color='grey', alpha=0.3)
plt.xlabel('Episode')
plt.ylabel('Act losses');  # trailing ';' keeps the returned Text repr out of the cell output

In [None]:
# Transpose the logged series and unpack its two rows; presumably these are
# (episode index, Env loss) pairs -- TODO confirm against the training loop.
eps, arr = np.array(eloss_list).T
smoothed_arr = running_mean(arr, 10)
# The smoothed series is shorter than the raw one, so it is drawn against the
# last len(smoothed_arr) episodes. The start index is computed explicitly
# because eps[-len(smoothed_arr):] turns into eps[-0:] (the whole axis) when
# the smoothed series is empty, and plt.plot then fails on mismatched lengths.
plt.plot(eps[len(eps) - len(smoothed_arr):], smoothed_arr)
plt.plot(eps, arr, color='grey', alpha=0.3)
plt.xlabel('Episode')
plt.ylabel('Env losses');  # trailing ';' keeps the returned Text repr out of the cell output

In [None]:
# Transpose the logged series and unpack its two rows; presumably these are
# (episode index, second Act loss) pairs -- TODO confirm against the training loop.
eps, arr = np.array(aloss2_list).T
smoothed_arr = running_mean(arr, 10)
# The smoothed series is shorter than the raw one, so it is drawn against the
# last len(smoothed_arr) episodes. The start index is computed explicitly
# because eps[-len(smoothed_arr):] turns into eps[-0:] (the whole axis) when
# the smoothed series is empty, and plt.plot then fails on mismatched lengths.
plt.plot(eps[len(eps) - len(smoothed_arr):], smoothed_arr)
plt.plot(eps, arr, color='grey', alpha=0.3)
plt.xlabel('Episode')
plt.ylabel('Act losses 2');  # trailing ';' keeps the returned Text repr out of the cell output

## Testing

Let's check out how our trained agent plays the game.

In [21]:
import gym

# Only CartPole-v1 is used for evaluation; it is created once so that no
# unused environment instance is left open.
env = gym.make('CartPole-v1')

try:
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        # Load the variables tracked by `saver` from the training checkpoint.
        saver.restore(sess, 'checkpoints/model.ckpt')
        #saver.restore(sess, tf.train.latest_checkpoint('checkpoints'))

        # Episodes/epochs
        for _ in range(10):
            state = env.reset()
            total_reward = 0

            # Steps/batches
            while True:
                env.render()
                # Feed the current state as a batch of one and act greedily
                # on the returned logits (no exploration at test time).
                action_logits = sess.run(model.actions_logits, feed_dict={model.states: state.reshape([1, -1])})
                action = np.argmax(action_logits)
                state, reward, done, _ = env.step(action)
                total_reward += reward
                if done:
                    print('total_reward: {}'.format(total_reward))
                    break
finally:
    # Release the environment even when restoring, rendering or stepping
    # raises, so the cell can be re-run without a leftover open instance.
    env.close()

INFO:tensorflow:Restoring parameters from checkpoints/model.ckpt
total_reward: 500.0
total_reward: 500.0
total_reward: 500.0
total_reward: 500.0
total_reward: 500.0
total_reward: 500.0
total_reward: 500.0
total_reward: 500.0
total_reward: 500.0
total_reward: 500.0
