In [4]:
import numpy as np
import pickle  # 用 pickle 替代 cPickle
import gym
np.bool8 = np.bool_

# Hyperparameters
H = 200  # number of hidden-layer neurons
batch_size = 10  # number of episodes between parameter updates
learning_rate = 1e-3
gamma = 0.99  # discount factor for rewards
decay_rate = 0.99  # decay factor for the RMSProp running average
resume = False  # resume from a previously saved checkpoint?
render = False

# Model initialisation
D = 80 * 80  # input dimensionality: 80x80 grid


def load_or_init_model(resume_from_checkpoint, checkpoint_path='save.p'):
    """Return the policy weights, restored from disk or freshly initialised.

    Args:
        resume_from_checkpoint: when true, unpickle and return the object
            stored at ``checkpoint_path`` instead of drawing new weights.
        checkpoint_path: location of the pickled checkpoint.

    Returns:
        When not resuming, a dict with 'W1' of shape (H, D) and 'W2' of
        shape (H,). When resuming, whatever object the checkpoint holds.

    Raises:
        OSError: if resuming and the checkpoint file cannot be opened.
    """
    if resume_from_checkpoint:
        # NOTE: unpickling can execute arbitrary code; only resume from a
        # checkpoint file that this script produced itself.
        with open(checkpoint_path, 'rb') as f:
            return pickle.load(f)
    weights = {}
    weights['W1'] = np.random.randn(H, D) / np.sqrt(D)  # "Xavier" initialisation
    weights['W2'] = np.random.randn(H) / np.sqrt(H)
    return weights


# Previously `resume` was declared but ignored, so training always restarted
# from random weights; it now selects between loading and initialising.
model = load_or_init_model(resume)

# Per-parameter buffers: gradients accumulated over a batch, and the RMSProp
# running average of squared gradients.
grad_buffer = {k: np.zeros_like(v) for k, v in model.items()}
rmsprop_cache = {k: np.zeros_like(v) for k, v in model.items()}

def sigmoid(x):
    """Return the logistic sigmoid 1 / (1 + exp(-x)), element-wise.

    The value is computed as exp(-log(1 + exp(-x))) through np.logaddexp.
    For large negative inputs the direct formula overflows inside np.exp
    and emits a RuntimeWarning; this form avoids that overflow and lets the
    result underflow to 0 instead.

    Args:
        x: a scalar or NumPy array.

    Returns:
        The sigmoid of ``x`` with the same shape as ``x``.
    """
    return np.exp(-np.logaddexp(0.0, -x))

def prepro(I):
    """Preprocess a 210x160x3 uint8 frame into a 6400-element (80x80) 1D float vector.

    The frame is cropped to rows 35..194, downsampled by a factor of 2 in
    both image axes, and reduced to its first colour channel. Pixels whose
    value is 144 or 109 (the two background shades) or 0 map to 0.0; every
    other pixel maps to 1.0.

    The input array is never written to. Slicing produces views of the
    caller's frame, so masking in place would overwrite the caller's
    observation and would fail on a read-only array.

    Args:
        I: the raw frame array.

    Returns:
        A flat np.float32 array of zeros and ones.
    """
    # Crop, downsample by 2 and select channel 0 in a single slicing step.
    frame = I[35:195:2, ::2, 0]
    # Foreground is everything that is neither background shade nor already 0.
    foreground = (frame != 0) & (frame != 144) & (frame != 109)
    return foreground.astype(np.float32).ravel()

def discount_rewards(r, discount=None):
    """Compute discounted returns for a sequence of per-step rewards.

    Walks the rewards from last to first, keeping a running sum that is
    multiplied by the discount factor at every step. The running sum is
    reset whenever a nonzero reward is met, because in Pong a nonzero
    reward marks a game boundary.

    Args:
        r: array-like of rewards. Any shape is accepted; elements are
            processed in flattened order and the result has the same shape.
        discount: discount factor. Defaults to the module-level ``gamma``.

    Returns:
        A floating-point array shaped like ``r``. Integer input produces
        float64 output so the discounted values are not truncated.
    """
    if discount is None:
        discount = gamma
    rewards = np.asarray(r)
    # Keep a floating dtype as-is; promote anything else to float64.
    if np.issubdtype(rewards.dtype, np.floating):
        out_dtype = rewards.dtype
    else:
        out_dtype = np.float64
    flat = rewards.reshape(-1)
    discounted = np.zeros(flat.shape, dtype=out_dtype)
    running_add = 0.0
    for t in reversed(range(flat.size)):
        if flat[t] != 0:
            running_add = 0.0  # game boundary (Pong-specific): restart the sum
        running_add = running_add * discount + flat[t]
        discounted[t] = running_add
    return discounted.reshape(rewards.shape)

def policy_forward(x):
    """Run the two-layer policy network forward on one input vector.

    Reads the weights from the module-level ``model`` dict.

    Args:
        x: the input vector fed to the first layer.

    Returns:
        A pair ``(p, h)``: ``p`` is the sigmoid of the output-layer
        activation, which the training loop uses as the probability of
        taking action 2, and ``h`` is the hidden state after the ReLU.
    """
    pre_activation = model['W1'] @ x
    # ReLU: negative activations become zero, the rest pass through.
    hidden = np.where(pre_activation < 0, 0.0, pre_activation)
    logit = model['W2'] @ hidden
    return sigmoid(logit), hidden

def policy_backward(eph, epdlogp):
    """Backpropagate through the policy network for a batch of steps.

    Besides its arguments, this function reads two module-level names:
    ``model`` (for the 'W2' weights) and ``epx`` (the stacked inputs), so
    both must be set before it is called.

    Args:
        eph: array of hidden states, one row per step.
        epdlogp: gradient signal at the network output, one row per step.

    Returns:
        A dict holding the gradients for 'W1' and 'W2'.
    """
    grad_w2 = (eph.T @ epdlogp).ravel()
    hidden_grad = np.outer(epdlogp, model['W2'])
    # Backprop through the ReLU: no gradient where the hidden unit was off.
    hidden_grad = np.where(eph <= 0, 0.0, hidden_grad)
    grad_w1 = hidden_grad.T @ epx
    return {'W1': grad_w1, 'W2': grad_w2}

# Training
# This script uses the Gym >= 0.26 API (reset() returns a tuple, step()
# returns five values). In that API rendering is selected when the
# environment is created, so `render` is mapped to render_mode here instead
# of calling env.render() on every step.
env = gym.make("Pong-v4", render_mode="human" if render else None)
observation, _ = env.reset()
prev_x = None  # previous preprocessed frame, used to build the difference frame
xs, hs, dlogps, drs = [], [], [], []
running_reward = None
reward_sum = 0
episode_number = 0

try:
    while True:
        # Preprocess the observation; the network input is the difference
        # between the current and the previous frame.
        cur_x = prepro(observation)
        x = cur_x - prev_x if prev_x is not None else np.zeros(D)
        prev_x = cur_x

        # Forward pass, then sample an action from the returned probability.
        aprob, h = policy_forward(x)
        action = 2 if np.random.uniform() < aprob else 3  # roll the dice!

        # Record intermediates needed later for backpropagation.
        xs.append(x)  # observation
        hs.append(h)  # hidden state
        y = 1 if action == 2 else 0  # a "fake label"
        dlogps.append(y - aprob)  # gradient that encourages the action taken

        # Step the environment and collect the reward.
        observation, reward, terminated, truncated, info = env.step(action)
        reward_sum += reward

        drs.append(reward)  # record the reward for this step

        if terminated or truncated:  # an episode finished
            episode_number += 1

            # Stack inputs, hidden states, action gradients and rewards.
            # NOTE: `epx` must stay a module-level name because
            # policy_backward reads it as a global.
            epx = np.vstack(xs)
            eph = np.vstack(hs)
            epdlogp = np.vstack(dlogps)
            epr = np.vstack(drs)
            xs, hs, dlogps, drs = [], [], [], []  # reset the per-episode buffers

            # Compute the discounted rewards and standardise them.
            discounted_epr = discount_rewards(epr)
            discounted_epr -= np.mean(discounted_epr)
            reward_std = np.std(discounted_epr)
            if reward_std > 0:
                # Skip the division when all returns are equal; dividing by
                # zero would fill the gradient with NaNs.
                discounted_epr /= reward_std

            epdlogp *= discounted_epr  # modulate the gradient with the advantage
            grad = policy_backward(eph, epdlogp)
            for k in model:
                grad_buffer[k] += grad[k]  # accumulate the gradient over the batch

            # Perform an RMSProp parameter update every batch_size episodes.
            if episode_number % batch_size == 0:
                for k, v in model.items():
                    g = grad_buffer[k]
                    rmsprop_cache[k] = decay_rate * rmsprop_cache[k] + (1 - decay_rate) * g ** 2
                    model[k] += learning_rate * g / (np.sqrt(rmsprop_cache[k]) + 1e-5)
                    grad_buffer[k] = np.zeros_like(v)  # reset the batch gradient buffer

            # Bookkeeping
            running_reward = reward_sum if running_reward is None else running_reward * 0.99 + reward_sum * 0.01
            print(f'resetting env. episode reward total was {reward_sum}. running mean: {running_reward}')
            if episode_number % 100 == 0:
                with open('save.p', 'wb') as f:
                    pickle.dump(model, f)
            reward_sum = 0
            observation, _ = env.reset()
            prev_x = None
finally:
    # The loop only ends through an exception (e.g. KeyboardInterrupt);
    # release the emulator and any render window when that happens.
    env.close()

resetting env. episode reward total was -21.0. running mean: -21.0
resetting env. episode reward total was -21.0. running mean: -21.0
resetting env. episode reward total was -21.0. running mean: -21.0
resetting env. episode reward total was -20.0. running mean: -20.99
resetting env. episode reward total was -20.0. running mean: -20.980099999999997
resetting env. episode reward total was -20.0. running mean: -20.970298999999997
resetting env. episode reward total was -19.0. running mean: -20.950596009999998
resetting env. episode reward total was -20.0. running mean: -20.941090049899998
resetting env. episode reward total was -21.0. running mean: -20.941679149400997
resetting env. episode reward total was -21.0. running mean: -20.942262357906987
resetting env. episode reward total was -21.0. running mean: -20.94283973432792
resetting env. episode reward total was -20.0. running mean: -20.933411336984637
resetting env. episode reward total was -20.0. running mean: -20.92407722361479
rese

resetting env. episode reward total was -21.0. running mean: -20.752621948825123
resetting env. episode reward total was -19.0. running mean: -20.735095729336873
resetting env. episode reward total was -21.0. running mean: -20.737744772043506
resetting env. episode reward total was -21.0. running mean: -20.74036732432307
resetting env. episode reward total was -21.0. running mean: -20.74296365107984
resetting env. episode reward total was -21.0. running mean: -20.745534014569042
resetting env. episode reward total was -21.0. running mean: -20.748078674423354
resetting env. episode reward total was -19.0. running mean: -20.73059788767912
resetting env. episode reward total was -21.0. running mean: -20.73329190880233
resetting env. episode reward total was -20.0. running mean: -20.725958989714304
resetting env. episode reward total was -21.0. running mean: -20.72869939981716
resetting env. episode reward total was -21.0. running mean: -20.73141240581899
resetting env. episode reward tota

resetting env. episode reward total was -20.0. running mean: -20.71709348850191
resetting env. episode reward total was -21.0. running mean: -20.71992255361689
resetting env. episode reward total was -21.0. running mean: -20.722723328080722
resetting env. episode reward total was -21.0. running mean: -20.725496094799915
resetting env. episode reward total was -21.0. running mean: -20.728241133851917
resetting env. episode reward total was -21.0. running mean: -20.7309587225134
resetting env. episode reward total was -21.0. running mean: -20.733649135288267
resetting env. episode reward total was -20.0. running mean: -20.726312643935383
resetting env. episode reward total was -21.0. running mean: -20.72904951749603
resetting env. episode reward total was -20.0. running mean: -20.72175902232107
resetting env. episode reward total was -20.0. running mean: -20.714541432097857
resetting env. episode reward total was -20.0. running mean: -20.707396017776876
resetting env. episode reward tota

resetting env. episode reward total was -19.0. running mean: -20.164668211346605
resetting env. episode reward total was -21.0. running mean: -20.17302152923314
resetting env. episode reward total was -18.0. running mean: -20.151291313940806
resetting env. episode reward total was -19.0. running mean: -20.1397784008014
resetting env. episode reward total was -19.0. running mean: -20.128380616793386
resetting env. episode reward total was -18.0. running mean: -20.107096810625453
resetting env. episode reward total was -21.0. running mean: -20.1160258425192
resetting env. episode reward total was -20.0. running mean: -20.114865584094005
resetting env. episode reward total was -20.0. running mean: -20.113716928253066
resetting env. episode reward total was -21.0. running mean: -20.122579758970534
resetting env. episode reward total was -20.0. running mean: -20.121353961380827
resetting env. episode reward total was -21.0. running mean: -20.13014042176702
resetting env. episode reward tota

resetting env. episode reward total was -19.0. running mean: -19.74490804301481
resetting env. episode reward total was -21.0. running mean: -19.757458962584664
resetting env. episode reward total was -19.0. running mean: -19.749884372958817
resetting env. episode reward total was -19.0. running mean: -19.74238552922923
resetting env. episode reward total was -19.0. running mean: -19.73496167393694
resetting env. episode reward total was -19.0. running mean: -19.72761205719757
resetting env. episode reward total was -20.0. running mean: -19.730335936625597
resetting env. episode reward total was -19.0. running mean: -19.723032577259342
resetting env. episode reward total was -17.0. running mean: -19.69580225148675
resetting env. episode reward total was -19.0. running mean: -19.688844228971885
resetting env. episode reward total was -19.0. running mean: -19.681955786682167
resetting env. episode reward total was -16.0. running mean: -19.645136228815346
resetting env. episode reward tot

resetting env. episode reward total was -18.0. running mean: -19.128973274813728
resetting env. episode reward total was -21.0. running mean: -19.14768354206559
resetting env. episode reward total was -20.0. running mean: -19.156206706644934
resetting env. episode reward total was -18.0. running mean: -19.144644639578484
resetting env. episode reward total was -20.0. running mean: -19.1531981931827
resetting env. episode reward total was -18.0. running mean: -19.14166621125087
resetting env. episode reward total was -15.0. running mean: -19.10024954913836
resetting env. episode reward total was -19.0. running mean: -19.09924705364698
resetting env. episode reward total was -21.0. running mean: -19.11825458311051
resetting env. episode reward total was -18.0. running mean: -19.107072037279405
resetting env. episode reward total was -18.0. running mean: -19.09600131690661
resetting env. episode reward total was -18.0. running mean: -19.085041303737544
resetting env. episode reward total 

resetting env. episode reward total was -10.0. running mean: -18.573268666770506
resetting env. episode reward total was -17.0. running mean: -18.557535980102802
resetting env. episode reward total was -17.0. running mean: -18.541960620301776
resetting env. episode reward total was -21.0. running mean: -18.56654101409876
resetting env. episode reward total was -16.0. running mean: -18.540875603957772
resetting env. episode reward total was -16.0. running mean: -18.515466847918194
resetting env. episode reward total was -19.0. running mean: -18.520312179439014
resetting env. episode reward total was -19.0. running mean: -18.525109057644624
resetting env. episode reward total was -18.0. running mean: -18.519857967068177
resetting env. episode reward total was -18.0. running mean: -18.514659387397494
resetting env. episode reward total was -16.0. running mean: -18.48951279352352
resetting env. episode reward total was -17.0. running mean: -18.474617665588287
resetting env. episode reward 

resetting env. episode reward total was -21.0. running mean: -18.02990796181358
resetting env. episode reward total was -21.0. running mean: -18.059608882195445
resetting env. episode reward total was -20.0. running mean: -18.07901279337349
resetting env. episode reward total was -19.0. running mean: -18.088222665439755
resetting env. episode reward total was -19.0. running mean: -18.09734043878536
resetting env. episode reward total was -19.0. running mean: -18.106367034397508
resetting env. episode reward total was -19.0. running mean: -18.115303364053535
resetting env. episode reward total was -15.0. running mean: -18.084150330413
resetting env. episode reward total was -18.0. running mean: -18.08330882710887
resetting env. episode reward total was -20.0. running mean: -18.10247573883778
resetting env. episode reward total was -17.0. running mean: -18.091450981449402
resetting env. episode reward total was -15.0. running mean: -18.060536471634908
resetting env. episode reward total 

resetting env. episode reward total was -17.0. running mean: -17.644941741557382
resetting env. episode reward total was -16.0. running mean: -17.628492324141806
resetting env. episode reward total was -16.0. running mean: -17.612207400900388
resetting env. episode reward total was -15.0. running mean: -17.586085326891382
resetting env. episode reward total was -19.0. running mean: -17.60022447362247
resetting env. episode reward total was -16.0. running mean: -17.584222228886244
resetting env. episode reward total was -16.0. running mean: -17.56838000659738
resetting env. episode reward total was -19.0. running mean: -17.582696206531406
resetting env. episode reward total was -18.0. running mean: -17.58686924446609
resetting env. episode reward total was -16.0. running mean: -17.57100055202143
resetting env. episode reward total was -13.0. running mean: -17.525290546501214
resetting env. episode reward total was -19.0. running mean: -17.540037641036204
resetting env. episode reward to

resetting env. episode reward total was -18.0. running mean: -17.10899328434351
resetting env. episode reward total was -15.0. running mean: -17.08790335150007
resetting env. episode reward total was -16.0. running mean: -17.07702431798507
resetting env. episode reward total was -15.0. running mean: -17.056254074805217
resetting env. episode reward total was -14.0. running mean: -17.025691534057167
resetting env. episode reward total was -20.0. running mean: -17.055434618716593
resetting env. episode reward total was -19.0. running mean: -17.074880272529427
resetting env. episode reward total was -17.0. running mean: -17.074131469804133
resetting env. episode reward total was -21.0. running mean: -17.113390155106092
resetting env. episode reward total was -16.0. running mean: -17.10225625355503
resetting env. episode reward total was -19.0. running mean: -17.12123369101948
resetting env. episode reward total was -19.0. running mean: -17.140021354109287
resetting env. episode reward tot

resetting env. episode reward total was -19.0. running mean: -16.134529780177587
resetting env. episode reward total was -16.0. running mean: -16.13318448237581
resetting env. episode reward total was -19.0. running mean: -16.16185263755205
resetting env. episode reward total was -19.0. running mean: -16.190234111176533
resetting env. episode reward total was -15.0. running mean: -16.178331770064766
resetting env. episode reward total was -16.0. running mean: -16.176548452364116
resetting env. episode reward total was -6.0. running mean: -16.074782967840473
resetting env. episode reward total was -13.0. running mean: -16.04403513816207
resetting env. episode reward total was -14.0. running mean: -16.02359478678045
resetting env. episode reward total was -16.0. running mean: -16.023358838912646
resetting env. episode reward total was -9.0. running mean: -15.953125250523518
resetting env. episode reward total was -15.0. running mean: -15.943593998018283
resetting env. episode reward tota

resetting env. episode reward total was -11.0. running mean: -15.267207105140399
resetting env. episode reward total was -10.0. running mean: -15.214535034088994
resetting env. episode reward total was -14.0. running mean: -15.202389683748105
resetting env. episode reward total was -9.0. running mean: -15.140365786910625
resetting env. episode reward total was -16.0. running mean: -15.148962129041518
resetting env. episode reward total was -15.0. running mean: -15.147472507751104
resetting env. episode reward total was -19.0. running mean: -15.185997782673592
resetting env. episode reward total was -15.0. running mean: -15.184137804846856
resetting env. episode reward total was -19.0. running mean: -15.222296426798387
resetting env. episode reward total was -15.0. running mean: -15.220073462530403
resetting env. episode reward total was -18.0. running mean: -15.247872727905099
resetting env. episode reward total was -15.0. running mean: -15.245394000626048
resetting env. episode reward

resetting env. episode reward total was -17.0. running mean: -14.087248644151332
resetting env. episode reward total was -17.0. running mean: -14.11637615770982
resetting env. episode reward total was -12.0. running mean: -14.09521239613272
resetting env. episode reward total was -19.0. running mean: -14.144260272171392
resetting env. episode reward total was -21.0. running mean: -14.212817669449679
resetting env. episode reward total was -7.0. running mean: -14.140689492755182
resetting env. episode reward total was -10.0. running mean: -14.09928259782763
resetting env. episode reward total was -8.0. running mean: -14.038289771849353
resetting env. episode reward total was -16.0. running mean: -14.05790687413086
resetting env. episode reward total was -9.0. running mean: -14.00732780538955
resetting env. episode reward total was -13.0. running mean: -13.997254527335656
resetting env. episode reward total was -10.0. running mean: -13.957281982062298
resetting env. episode reward total 

resetting env. episode reward total was -8.0. running mean: -12.877279417389756
resetting env. episode reward total was -10.0. running mean: -12.848506623215858
resetting env. episode reward total was -13.0. running mean: -12.8500215569837
resetting env. episode reward total was -9.0. running mean: -12.811521341413863
resetting env. episode reward total was 2.0. running mean: -12.663406127999725
resetting env. episode reward total was -11.0. running mean: -12.646772066719727
resetting env. episode reward total was -11.0. running mean: -12.63030434605253
resetting env. episode reward total was -9.0. running mean: -12.594001302592003
resetting env. episode reward total was -15.0. running mean: -12.618061289566084
resetting env. episode reward total was -8.0. running mean: -12.571880676670423
resetting env. episode reward total was -10.0. running mean: -12.546161869903719
resetting env. episode reward total was -8.0. running mean: -12.500700251204682
resetting env. episode reward total wa

resetting env. episode reward total was -9.0. running mean: -11.649904047673022
resetting env. episode reward total was -15.0. running mean: -11.683405007196292
resetting env. episode reward total was -11.0. running mean: -11.676570957124328
resetting env. episode reward total was -18.0. running mean: -11.739805247553084
resetting env. episode reward total was -10.0. running mean: -11.722407195077553
resetting env. episode reward total was -11.0. running mean: -11.715183123126776
resetting env. episode reward total was -12.0. running mean: -11.718031291895507
resetting env. episode reward total was -3.0. running mean: -11.63085097897655
resetting env. episode reward total was -14.0. running mean: -11.654542469186785
resetting env. episode reward total was -9.0. running mean: -11.627997044494917
resetting env. episode reward total was -18.0. running mean: -11.691717074049967
resetting env. episode reward total was -8.0. running mean: -11.654799903309467
resetting env. episode reward tot

resetting env. episode reward total was -5.0. running mean: -11.178774196521475
resetting env. episode reward total was -16.0. running mean: -11.226986454556261
resetting env. episode reward total was -11.0. running mean: -11.224716590010697
resetting env. episode reward total was -12.0. running mean: -11.232469424110588
resetting env. episode reward total was -12.0. running mean: -11.240144729869481
resetting env. episode reward total was -7.0. running mean: -11.197743282570787
resetting env. episode reward total was -6.0. running mean: -11.14576584974508
resetting env. episode reward total was -5.0. running mean: -11.084308191247631
resetting env. episode reward total was -8.0. running mean: -11.053465109335155
resetting env. episode reward total was -7.0. running mean: -11.012930458241804
resetting env. episode reward total was -9.0. running mean: -10.992801153659386
resetting env. episode reward total was -13.0. running mean: -11.012873142122793
resetting env. episode reward total 

resetting env. episode reward total was -7.0. running mean: -8.688892498566636
resetting env. episode reward total was -9.0. running mean: -8.692003573580969
resetting env. episode reward total was -6.0. running mean: -8.66508353784516
resetting env. episode reward total was -11.0. running mean: -8.688432702466708
resetting env. episode reward total was -5.0. running mean: -8.651548375442042
resetting env. episode reward total was -10.0. running mean: -8.665032891687622
resetting env. episode reward total was 7.0. running mean: -8.508382562770745
resetting env. episode reward total was -7.0. running mean: -8.493298737143038
resetting env. episode reward total was -5.0. running mean: -8.458365749771607
resetting env. episode reward total was -10.0. running mean: -8.47378209227389
resetting env. episode reward total was -12.0. running mean: -8.50904427135115
resetting env. episode reward total was -8.0. running mean: -8.50395382863764
resetting env. episode reward total was -7.0. running

resetting env. episode reward total was -10.0. running mean: -7.341181932461678
resetting env. episode reward total was -5.0. running mean: -7.317770113137061
resetting env. episode reward total was -5.0. running mean: -7.29459241200569
resetting env. episode reward total was 10.0. running mean: -7.121646487885633
resetting env. episode reward total was -2.0. running mean: -7.070430023006777
resetting env. episode reward total was -4.0. running mean: -7.039725722776709
resetting env. episode reward total was -6.0. running mean: -7.029328465548942
resetting env. episode reward total was -9.0. running mean: -7.049035180893452
resetting env. episode reward total was -8.0. running mean: -7.058544829084518
resetting env. episode reward total was -8.0. running mean: -7.067959380793672
resetting env. episode reward total was 8.0. running mean: -6.917279786985735
resetting env. episode reward total was 1.0. running mean: -6.838106989115878
resetting env. episode reward total was 3.0. running m

resetting env. episode reward total was -7.0. running mean: -6.597267623141805
resetting env. episode reward total was -10.0. running mean: -6.631294946910387
resetting env. episode reward total was -3.0. running mean: -6.594981997441283
resetting env. episode reward total was -6.0. running mean: -6.58903217746687
resetting env. episode reward total was -2.0. running mean: -6.543141855692201
resetting env. episode reward total was -9.0. running mean: -6.567710437135279
resetting env. episode reward total was -4.0. running mean: -6.542033332763927
resetting env. episode reward total was -5.0. running mean: -6.526612999436287
resetting env. episode reward total was -5.0. running mean: -6.511346869441923
resetting env. episode reward total was -8.0. running mean: -6.526233400747504
resetting env. episode reward total was -12.0. running mean: -6.580971066740029
resetting env. episode reward total was -7.0. running mean: -6.585161356072629
resetting env. episode reward total was -9.0. runni

resetting env. episode reward total was -12.0. running mean: -6.471406301879112
resetting env. episode reward total was -7.0. running mean: -6.476692238860321
resetting env. episode reward total was -6.0. running mean: -6.471925316471717
resetting env. episode reward total was -4.0. running mean: -6.447206063307
resetting env. episode reward total was 2.0. running mean: -6.36273400267393
resetting env. episode reward total was 1.0. running mean: -6.289106662647191
resetting env. episode reward total was -9.0. running mean: -6.316215596020719
resetting env. episode reward total was -2.0. running mean: -6.273053440060511
resetting env. episode reward total was -4.0. running mean: -6.250322905659906
resetting env. episode reward total was -16.0. running mean: -6.347819676603307
resetting env. episode reward total was -3.0. running mean: -6.314341479837275
resetting env. episode reward total was -9.0. running mean: -6.341198065038902
resetting env. episode reward total was -3.0. running me

resetting env. episode reward total was 4.0. running mean: -5.546967894851319
resetting env. episode reward total was 2.0. running mean: -5.471498215902806
resetting env. episode reward total was -11.0. running mean: -5.526783233743778
resetting env. episode reward total was -7.0. running mean: -5.541515401406341
resetting env. episode reward total was -8.0. running mean: -5.566100247392277
resetting env. episode reward total was -11.0. running mean: -5.620439244918354
resetting env. episode reward total was -3.0. running mean: -5.594234852469171
resetting env. episode reward total was 2.0. running mean: -5.51829250394448
resetting env. episode reward total was -7.0. running mean: -5.5331095789050355
resetting env. episode reward total was -8.0. running mean: -5.557778483115985
resetting env. episode reward total was -7.0. running mean: -5.572200698284826
resetting env. episode reward total was -10.0. running mean: -5.616478691301977
resetting env. episode reward total was -3.0. runnin

resetting env. episode reward total was -4.0. running mean: -5.3395884459791905
resetting env. episode reward total was 7.0. running mean: -5.2161925615193985
resetting env. episode reward total was 6.0. running mean: -5.104030635904205
resetting env. episode reward total was -9.0. running mean: -5.142990329545163
resetting env. episode reward total was 3.0. running mean: -5.061560426249711
resetting env. episode reward total was -10.0. running mean: -5.110944821987213
resetting env. episode reward total was -5.0. running mean: -5.109835373767341
resetting env. episode reward total was -5.0. running mean: -5.108737020029667
resetting env. episode reward total was 5.0. running mean: -5.0076496498293706
resetting env. episode reward total was 10.0. running mean: -4.857573153331077
resetting env. episode reward total was -7.0. running mean: -4.8789974217977665
resetting env. episode reward total was -7.0. running mean: -4.900207447579789
resetting env. episode reward total was -4.0. runni

resetting env. episode reward total was -5.0. running mean: -4.76733121293184
resetting env. episode reward total was 6.0. running mean: -4.659657900802522
resetting env. episode reward total was -2.0. running mean: -4.633061321794496
resetting env. episode reward total was -3.0. running mean: -4.616730708576551
resetting env. episode reward total was -2.0. running mean: -4.590563401490785
resetting env. episode reward total was -5.0. running mean: -4.594657767475877
resetting env. episode reward total was -4.0. running mean: -4.588711189801119
resetting env. episode reward total was -8.0. running mean: -4.622824077903108
resetting env. episode reward total was -4.0. running mean: -4.616595837124076
resetting env. episode reward total was -2.0. running mean: -4.590429878752835
resetting env. episode reward total was -12.0. running mean: -4.664525579965307
resetting env. episode reward total was -4.0. running mean: -4.657880324165654
resetting env. episode reward total was -6.0. running

resetting env. episode reward total was -10.0. running mean: -4.317857303821868
resetting env. episode reward total was -12.0. running mean: -4.3946787307836495
resetting env. episode reward total was 1.0. running mean: -4.340731943475813
resetting env. episode reward total was 3.0. running mean: -4.267324624041055
resetting env. episode reward total was -1.0. running mean: -4.234651377800644
resetting env. episode reward total was 2.0. running mean: -4.172304864022638
resetting env. episode reward total was -6.0. running mean: -4.190581815382411
resetting env. episode reward total was -5.0. running mean: -4.1986759972285865
resetting env. episode reward total was -10.0. running mean: -4.2566892372563006
resetting env. episode reward total was 6.0. running mean: -4.154122344883738
resetting env. episode reward total was -4.0. running mean: -4.1525811214349
resetting env. episode reward total was 8.0. running mean: -4.031055310220551
resetting env. episode reward total was 5.0. running 

resetting env. episode reward total was 4.0. running mean: -2.581632758784041
resetting env. episode reward total was -7.0. running mean: -2.6258164311962005
resetting env. episode reward total was -9.0. running mean: -2.689558266884238
resetting env. episode reward total was -3.0. running mean: -2.6926626842153953
resetting env. episode reward total was -10.0. running mean: -2.7657360573732412
resetting env. episode reward total was -5.0. running mean: -2.788078696799509
resetting env. episode reward total was -12.0. running mean: -2.880197909831514
resetting env. episode reward total was 6.0. running mean: -2.791395930733199
resetting env. episode reward total was 3.0. running mean: -2.733481971425867
resetting env. episode reward total was -3.0. running mean: -2.736147151711608
resetting env. episode reward total was -9.0. running mean: -2.798785680194492
resetting env. episode reward total was -11.0. running mean: -2.880797823392547
resetting env. episode reward total was -3.0. run

resetting env. episode reward total was 3.0. running mean: -2.6029846370162177
resetting env. episode reward total was -3.0. running mean: -2.6069547906460553
resetting env. episode reward total was 8.0. running mean: -2.5008852427395944
resetting env. episode reward total was -8.0. running mean: -2.5558763903121986
resetting env. episode reward total was -7.0. running mean: -2.6003176264090766
resetting env. episode reward total was 4.0. running mean: -2.534314450144986
resetting env. episode reward total was -7.0. running mean: -2.578971305643536
resetting env. episode reward total was 5.0. running mean: -2.503181592587101
resetting env. episode reward total was -2.0. running mean: -2.4981497766612297
resetting env. episode reward total was 7.0. running mean: -2.4031682788946176
resetting env. episode reward total was 5.0. running mean: -2.3291365961056716
resetting env. episode reward total was -4.0. running mean: -2.345845230144615
resetting env. episode reward total was -3.0. runn

resetting env. episode reward total was 3.0. running mean: -1.8020955442480493
resetting env. episode reward total was 1.0. running mean: -1.7740745888055687
resetting env. episode reward total was -5.0. running mean: -1.806333842917513
resetting env. episode reward total was 3.0. running mean: -1.758270504488338
resetting env. episode reward total was -15.0. running mean: -1.8906877994434546
resetting env. episode reward total was 2.0. running mean: -1.85178092144902
resetting env. episode reward total was 7.0. running mean: -1.7632631122345297
resetting env. episode reward total was -15.0. running mean: -1.8956304811121842
resetting env. episode reward total was -4.0. running mean: -1.9166741763010624
resetting env. episode reward total was -9.0. running mean: -1.987507434538052
resetting env. episode reward total was 1.0. running mean: -1.9576323601926713
resetting env. episode reward total was -5.0. running mean: -1.9880560365907447
resetting env. episode reward total was -13.0. ru

resetting env. episode reward total was 2.0. running mean: -2.697556701480713
resetting env. episode reward total was 1.0. running mean: -2.660581134465906
resetting env. episode reward total was -5.0. running mean: -2.683975323121247
resetting env. episode reward total was -7.0. running mean: -2.7271355698900344
resetting env. episode reward total was -10.0. running mean: -2.799864214191134
resetting env. episode reward total was -10.0. running mean: -2.8718655720492228
resetting env. episode reward total was -6.0. running mean: -2.9031469163287307
resetting env. episode reward total was -1.0. running mean: -2.8841154471654433
resetting env. episode reward total was -1.0. running mean: -2.8652742926937886
resetting env. episode reward total was -5.0. running mean: -2.8866215497668506
resetting env. episode reward total was -7.0. running mean: -2.927755334269182
resetting env. episode reward total was -2.0. running mean: -2.9184777809264904
resetting env. episode reward total was -4.0.

resetting env. episode reward total was -11.0. running mean: -2.3328654511597695
resetting env. episode reward total was 9.0. running mean: -2.219536796648172
resetting env. episode reward total was -9.0. running mean: -2.28734142868169
resetting env. episode reward total was -4.0. running mean: -2.3044680143948733
resetting env. episode reward total was 3.0. running mean: -2.2514233342509247
resetting env. episode reward total was -8.0. running mean: -2.3089091009084153
resetting env. episode reward total was -9.0. running mean: -2.375820009899331
resetting env. episode reward total was -2.0. running mean: -2.372061809800338
resetting env. episode reward total was 6.0. running mean: -2.2883411917023344
resetting env. episode reward total was 4.0. running mean: -2.225457779785311
resetting env. episode reward total was 2.0. running mean: -2.1832032019874577
resetting env. episode reward total was 7.0. running mean: -2.091371169967583
resetting env. episode reward total was 4.0. running

resetting env. episode reward total was 3.0. running mean: -0.7643843268819642
resetting env. episode reward total was 2.0. running mean: -0.7367404836131446
resetting env. episode reward total was -15.0. running mean: -0.8793730787770132
resetting env. episode reward total was -2.0. running mean: -0.8905793479892431
resetting env. episode reward total was 4.0. running mean: -0.8416735545093507
resetting env. episode reward total was -3.0. running mean: -0.8632568189642572
resetting env. episode reward total was -2.0. running mean: -0.8746242507746147
resetting env. episode reward total was 3.0. running mean: -0.8358780082668685
resetting env. episode reward total was 6.0. running mean: -0.7675192281841998
resetting env. episode reward total was 6.0. running mean: -0.6998440359023579
resetting env. episode reward total was -7.0. running mean: -0.7628455955433342
resetting env. episode reward total was 12.0. running mean: -0.6352171395879008
resetting env. episode reward total was 2.0. 

resetting env. episode reward total was 5.0. running mean: -0.4959055237055841
resetting env. episode reward total was 9.0. running mean: -0.4009464684685282
resetting env. episode reward total was -5.0. running mean: -0.44693700378384293
resetting env. episode reward total was 11.0. running mean: -0.3324676337460045
resetting env. episode reward total was -12.0. running mean: -0.4491429574085445
resetting env. episode reward total was 3.0. running mean: -0.414651527834459
resetting env. episode reward total was 2.0. running mean: -0.3905050125561144
resetting env. episode reward total was -3.0. running mean: -0.4165999624305533
resetting env. episode reward total was -9.0. running mean: -0.5024339628062477
resetting env. episode reward total was -1.0. running mean: -0.5074096231781853
resetting env. episode reward total was 13.0. running mean: -0.3723355269464034
resetting env. episode reward total was -6.0. running mean: -0.4286121716769393
resetting env. episode reward total was -1.

resetting env. episode reward total was -6.0. running mean: -1.0834184777373967
resetting env. episode reward total was -1.0. running mean: -1.0825842929600227
resetting env. episode reward total was -12.0. running mean: -1.1917584500304224
resetting env. episode reward total was -2.0. running mean: -1.199840865530118
resetting env. episode reward total was -12.0. running mean: -1.307842456874817
resetting env. episode reward total was 3.0. running mean: -1.264764032306069
resetting env. episode reward total was -3.0. running mean: -1.2821163919830083
resetting env. episode reward total was -1.0. running mean: -1.2792952280631782
resetting env. episode reward total was 11.0. running mean: -1.1565022757825463
resetting env. episode reward total was -10.0. running mean: -1.244937253024721
resetting env. episode reward total was -1.0. running mean: -1.2424878804944737
resetting env. episode reward total was 5.0. running mean: -1.1800630016895288
resetting env. episode reward total was -5.

resetting env. episode reward total was 9.0. running mean: 0.10071866157559016
resetting env. episode reward total was 8.0. running mean: 0.17971147495983425
resetting env. episode reward total was -2.0. running mean: 0.15791436021023592
resetting env. episode reward total was -3.0. running mean: 0.12633521660813354
resetting env. episode reward total was -5.0. running mean: 0.0750718644420522
resetting env. episode reward total was -6.0. running mean: 0.014321145797631676
resetting env. episode reward total was -5.0. running mean: -0.03582206566034464
resetting env. episode reward total was 14.0. running mean: 0.10453615499625882
resetting env. episode reward total was -4.0. running mean: 0.06349079344629624
resetting env. episode reward total was 9.0. running mean: 0.15285588551183327
resetting env. episode reward total was -9.0. running mean: 0.06132732665671495
resetting env. episode reward total was -6.0. running mean: 0.0007140533901478022
resetting env. episode reward total was 

resetting env. episode reward total was -1.0. running mean: -0.7825026462084786
resetting env. episode reward total was -6.0. running mean: -0.8346776197463939
resetting env. episode reward total was 2.0. running mean: -0.8063308435489299
resetting env. episode reward total was 3.0. running mean: -0.7682675351134406
resetting env. episode reward total was 6.0. running mean: -0.7005848597623061
resetting env. episode reward total was 9.0. running mean: -0.603579011164683
resetting env. episode reward total was 1.0. running mean: -0.5875432210530362
resetting env. episode reward total was -3.0. running mean: -0.6116677888425059
resetting env. episode reward total was 2.0. running mean: -0.5855511109540809
resetting env. episode reward total was 5.0. running mean: -0.52969559984454
resetting env. episode reward total was -13.0. running mean: -0.6543986438460946
resetting env. episode reward total was -6.0. running mean: -0.7078546574076336
resetting env. episode reward total was -6.0. run

resetting env. episode reward total was 3.0. running mean: 0.425397035308932
resetting env. episode reward total was 2.0. running mean: 0.4411430649558427
resetting env. episode reward total was -11.0. running mean: 0.32673163430628427
resetting env. episode reward total was 3.0. running mean: 0.3534643179632214
resetting env. episode reward total was -4.0. running mean: 0.30992967478358924
resetting env. episode reward total was -1.0. running mean: 0.29683037803575335
resetting env. episode reward total was 5.0. running mean: 0.3438620742553958
resetting env. episode reward total was 1.0. running mean: 0.3504234535128418
resetting env. episode reward total was -1.0. running mean: 0.3369192189777134
resetting env. episode reward total was -3.0. running mean: 0.3035500267879362
resetting env. episode reward total was -7.0. running mean: 0.23051452652005683
resetting env. episode reward total was 6.0. running mean: 0.2882093812548563
resetting env. episode reward total was 3.0. running m

resetting env. episode reward total was 3.0. running mean: 0.265440427261775
resetting env. episode reward total was -2.0. running mean: 0.24278602298915727
resetting env. episode reward total was 11.0. running mean: 0.3503581627592657
resetting env. episode reward total was -5.0. running mean: 0.296854581131673
resetting env. episode reward total was -14.0. running mean: 0.15388603532035627
resetting env. episode reward total was -2.0. running mean: 0.13234717496715273
resetting env. episode reward total was -5.0. running mean: 0.08102370321748119
resetting env. episode reward total was -4.0. running mean: 0.04021346618530638
resetting env. episode reward total was 11.0. running mean: 0.14981133152345333
resetting env. episode reward total was 5.0. running mean: 0.1983132182082188
resetting env. episode reward total was -12.0. running mean: 0.07633008602613661
resetting env. episode reward total was -9.0. running mean: -0.014433214834124755
resetting env. episode reward total was 15.0

resetting env. episode reward total was 5.0. running mean: -0.31916536299230164
resetting env. episode reward total was 4.0. running mean: -0.27597370936237864
resetting env. episode reward total was -1.0. running mean: -0.28321397226875483
resetting env. episode reward total was -13.0. running mean: -0.4103818325460673
resetting env. episode reward total was -7.0. running mean: -0.4762780142206066
resetting env. episode reward total was -3.0. running mean: -0.5015152340784006
resetting env. episode reward total was 4.0. running mean: -0.4565000817376166
resetting env. episode reward total was -11.0. running mean: -0.5619350809202405
resetting env. episode reward total was -4.0. running mean: -0.5963157301110381
resetting env. episode reward total was 5.0. running mean: -0.5403525728099277
resetting env. episode reward total was 8.0. running mean: -0.4549490470818283
resetting env. episode reward total was -5.0. running mean: -0.5003995566110101
resetting env. episode reward total was 

resetting env. episode reward total was 9.0. running mean: 0.22023017295443068
resetting env. episode reward total was 6.0. running mean: 0.27802787122488637
resetting env. episode reward total was -7.0. running mean: 0.20524759251263752
resetting env. episode reward total was -13.0. running mean: 0.07319511658751113
resetting env. episode reward total was 13.0. running mean: 0.202463165421636
resetting env. episode reward total was 6.0. running mean: 0.26043853376741966
resetting env. episode reward total was -4.0. running mean: 0.21783414842974544
resetting env. episode reward total was 4.0. running mean: 0.255655806945448
resetting env. episode reward total was 3.0. running mean: 0.28309924887599347
resetting env. episode reward total was 1.0. running mean: 0.2902682563872335
resetting env. episode reward total was 4.0. running mean: 0.3273655738233612
resetting env. episode reward total was -3.0. running mean: 0.2940919180851276
resetting env. episode reward total was -6.0. running

resetting env. episode reward total was 1.0. running mean: 0.4868635340944833
resetting env. episode reward total was -11.0. running mean: 0.37199489875353847
resetting env. episode reward total was 1.0. running mean: 0.37827494976600307
resetting env. episode reward total was -5.0. running mean: 0.324492200268343
resetting env. episode reward total was 4.0. running mean: 0.36124727826565955
resetting env. episode reward total was 7.0. running mean: 0.427634805483003
resetting env. episode reward total was 7.0. running mean: 0.49335845742817297
resetting env. episode reward total was -9.0. running mean: 0.3984248728538913
resetting env. episode reward total was -5.0. running mean: 0.34444062412535237
resetting env. episode reward total was 1.0. running mean: 0.35099621788409885
resetting env. episode reward total was 8.0. running mean: 0.4274862557052579
resetting env. episode reward total was 10.0. running mean: 0.5232113931482053
resetting env. episode reward total was -1.0. running 

resetting env. episode reward total was -4.0. running mean: 0.19895887234198675
resetting env. episode reward total was -2.0. running mean: 0.1769692836185669
resetting env. episode reward total was 8.0. running mean: 0.25519959078238125
resetting env. episode reward total was 3.0. running mean: 0.2826475948745575
resetting env. episode reward total was -1.0. running mean: 0.2698211189258119
resetting env. episode reward total was 2.0. running mean: 0.2871229077365538
resetting env. episode reward total was -4.0. running mean: 0.24425167865918826
resetting env. episode reward total was -9.0. running mean: 0.15180916187259638
resetting env. episode reward total was -3.0. running mean: 0.12029107025387042
resetting env. episode reward total was 4.0. running mean: 0.1590881595513317
resetting env. episode reward total was 2.0. running mean: 0.17749727795581838
resetting env. episode reward total was 11.0. running mean: 0.2857223051762602
resetting env. episode reward total was -1.0. runni

resetting env. episode reward total was -1.0. running mean: 0.9214684289372455
resetting env. episode reward total was -3.0. running mean: 0.882253744647873
resetting env. episode reward total was -9.0. running mean: 0.7834312072013943
resetting env. episode reward total was -5.0. running mean: 0.7255968951293803
resetting env. episode reward total was 6.0. running mean: 0.7783409261780865
resetting env. episode reward total was 6.0. running mean: 0.8305575169163055
resetting env. episode reward total was 4.0. running mean: 0.8622519417471425
resetting env. episode reward total was -1.0. running mean: 0.8436294223296711
resetting env. episode reward total was -5.0. running mean: 0.7851931281063743
resetting env. episode reward total was -2.0. running mean: 0.7573411968253105
resetting env. episode reward total was 9.0. running mean: 0.8397677848570574
resetting env. episode reward total was -13.0. running mean: 0.7013701070084868
resetting env. episode reward total was 6.0. running mea

resetting env. episode reward total was 8.0. running mean: 1.37340859137482
resetting env. episode reward total was -2.0. running mean: 1.3396745054610717
resetting env. episode reward total was -7.0. running mean: 1.256277760406461
resetting env. episode reward total was 7.0. running mean: 1.3137149828023964
resetting env. episode reward total was -4.0. running mean: 1.2605778329743724
resetting env. episode reward total was 4.0. running mean: 1.2879720546446287
resetting env. episode reward total was -2.0. running mean: 1.2550923340981823
resetting env. episode reward total was 3.0. running mean: 1.2725414107572006
resetting env. episode reward total was 3.0. running mean: 1.2898159966496285
resetting env. episode reward total was 1.0. running mean: 1.2869178366831322
resetting env. episode reward total was -5.0. running mean: 1.2240486583163008
resetting env. episode reward total was -3.0. running mean: 1.1818081717331377
resetting env. episode reward total was -1.0. running mean: 1

resetting env. episode reward total was -11.0. running mean: 1.55803583909307
resetting env. episode reward total was 12.0. running mean: 1.6624554807021394
resetting env. episode reward total was 11.0. running mean: 1.7558309258951181
resetting env. episode reward total was 8.0. running mean: 1.818272616636167
resetting env. episode reward total was 7.0. running mean: 1.8700898904698053
resetting env. episode reward total was -5.0. running mean: 1.8013889915651071
resetting env. episode reward total was -3.0. running mean: 1.753375101649456
resetting env. episode reward total was -1.0. running mean: 1.7258413506329615
resetting env. episode reward total was 7.0. running mean: 1.778582937126632
resetting env. episode reward total was 4.0. running mean: 1.8007971077553657
resetting env. episode reward total was 6.0. running mean: 1.8427891366778122
resetting env. episode reward total was 4.0. running mean: 1.864361245311034
resetting env. episode reward total was 11.0. running mean: 1.9

resetting env. episode reward total was -4.0. running mean: 1.5047216504352723
resetting env. episode reward total was -8.0. running mean: 1.4096744339309195
resetting env. episode reward total was 3.0. running mean: 1.4255776895916104
resetting env. episode reward total was 5.0. running mean: 1.4613219126956942
resetting env. episode reward total was 12.0. running mean: 1.5667086935687373
resetting env. episode reward total was 8.0. running mean: 1.63104160663305
resetting env. episode reward total was -3.0. running mean: 1.5847311905667194
resetting env. episode reward total was 11.0. running mean: 1.6788838786610523
resetting env. episode reward total was 5.0. running mean: 1.7120950398744417
resetting env. episode reward total was 5.0. running mean: 1.7449740894756973
resetting env. episode reward total was 2.0. running mean: 1.7475243485809404
resetting env. episode reward total was -2.0. running mean: 1.710049105095131
resetting env. episode reward total was 3.0. running mean: 1.

resetting env. episode reward total was -3.0. running mean: 1.5214230748254671
resetting env. episode reward total was 10.0. running mean: 1.6062088440772124
resetting env. episode reward total was 4.0. running mean: 1.6301467556364404
resetting env. episode reward total was 11.0. running mean: 1.723845288080076
resetting env. episode reward total was -9.0. running mean: 1.6166068351992753
resetting env. episode reward total was 1.0. running mean: 1.6104407668472824
resetting env. episode reward total was -3.0. running mean: 1.5643363591788095
resetting env. episode reward total was -6.0. running mean: 1.4886929955870214
resetting env. episode reward total was -1.0. running mean: 1.4638060656311511
resetting env. episode reward total was -1.0. running mean: 1.4391680049748397
resetting env. episode reward total was 13.0. running mean: 1.5547763249250912
resetting env. episode reward total was -11.0. running mean: 1.4292285616758402
resetting env. episode reward total was 6.0. running m

resetting env. episode reward total was -9.0. running mean: 1.8618313576884487
resetting env. episode reward total was 7.0. running mean: 1.9132130441115642
resetting env. episode reward total was 10.0. running mean: 1.9940809136704487
resetting env. episode reward total was 9.0. running mean: 2.064140104533744
resetting env. episode reward total was -2.0. running mean: 2.0234987034884067
resetting env. episode reward total was 9.0. running mean: 2.0932637164535226
resetting env. episode reward total was -13.0. running mean: 1.9423310792889876
resetting env. episode reward total was -5.0. running mean: 1.8729077684960977
resetting env. episode reward total was -11.0. running mean: 1.7441786908111367
resetting env. episode reward total was -4.0. running mean: 1.6867369039030253
resetting env. episode reward total was -15.0. running mean: 1.5198695348639952
resetting env. episode reward total was -4.0. running mean: 1.4646708395153551
resetting env. episode reward total was 3.0. running 

resetting env. episode reward total was 5.0. running mean: 1.654380394366489
resetting env. episode reward total was 9.0. running mean: 1.7278365904228241
resetting env. episode reward total was 11.0. running mean: 1.820558224518596
resetting env. episode reward total was 6.0. running mean: 1.8623526422734102
resetting env. episode reward total was 7.0. running mean: 1.9137291158506762
resetting env. episode reward total was 5.0. running mean: 1.9445918246921694
resetting env. episode reward total was -3.0. running mean: 1.8951459064452476
resetting env. episode reward total was -2.0. running mean: 1.8561944473807952
resetting env. episode reward total was 1.0. running mean: 1.8476325029069873
resetting env. episode reward total was -1.0. running mean: 1.8191561778779175
resetting env. episode reward total was -3.0. running mean: 1.7709646160991384
resetting env. episode reward total was -12.0. running mean: 1.6332549699381471
resetting env. episode reward total was 7.0. running mean: 

resetting env. episode reward total was -3.0. running mean: 1.5135314972149274
resetting env. episode reward total was 13.0. running mean: 1.6283961822427782
resetting env. episode reward total was -1.0. running mean: 1.6021122204203504
resetting env. episode reward total was -4.0. running mean: 1.5460910982161469
resetting env. episode reward total was -6.0. running mean: 1.4706301872339853
resetting env. episode reward total was 8.0. running mean: 1.5359238853616455
resetting env. episode reward total was -1.0. running mean: 1.510564646508029
resetting env. episode reward total was 11.0. running mean: 1.6054590000429487
resetting env. episode reward total was 2.0. running mean: 1.6094044100425193
resetting env. episode reward total was 10.0. running mean: 1.6933103659420943
resetting env. episode reward total was -2.0. running mean: 1.6563772622826733
resetting env. episode reward total was 12.0. running mean: 1.7598134896598467
resetting env. episode reward total was -2.0. running m

resetting env. episode reward total was 9.0. running mean: 1.4470111948425761
resetting env. episode reward total was 7.0. running mean: 1.5025410828941503
resetting env. episode reward total was -3.0. running mean: 1.4575156720652087
resetting env. episode reward total was -9.0. running mean: 1.3529405153445566
resetting env. episode reward total was 2.0. running mean: 1.359411110191111
resetting env. episode reward total was 3.0. running mean: 1.3758169990892
resetting env. episode reward total was 3.0. running mean: 1.392058829098308
resetting env. episode reward total was -5.0. running mean: 1.3281382408073248
resetting env. episode reward total was -17.0. running mean: 1.1448568583992516
resetting env. episode reward total was 12.0. running mean: 1.2534082898152592
resetting env. episode reward total was 5.0. running mean: 1.2908742069171066
resetting env. episode reward total was -5.0. running mean: 1.2279654648479355
resetting env. episode reward total was 12.0. running mean: 1.

resetting env. episode reward total was 14.0. running mean: 3.4985696952719083
resetting env. episode reward total was 7.0. running mean: 3.5335839983191892
resetting env. episode reward total was -1.0. running mean: 3.4882481583359977
resetting env. episode reward total was 2.0. running mean: 3.4733656767526377
resetting env. episode reward total was 6.0. running mean: 3.4986320199851115
resetting env. episode reward total was 2.0. running mean: 3.4836456997852605
resetting env. episode reward total was 6.0. running mean: 3.508809242787408
resetting env. episode reward total was -2.0. running mean: 3.4537211503595335
resetting env. episode reward total was -7.0. running mean: 3.3491839388559383
resetting env. episode reward total was -6.0. running mean: 3.255692099467379
resetting env. episode reward total was 5.0. running mean: 3.273135178472705
resetting env. episode reward total was -5.0. running mean: 3.190403826687978
resetting env. episode reward total was 2.0. running mean: 3.1

resetting env. episode reward total was 11.0. running mean: 3.716432678011521
resetting env. episode reward total was -4.0. running mean: 3.6392683512314057
resetting env. episode reward total was 15.0. running mean: 3.7528756677190915
resetting env. episode reward total was 2.0. running mean: 3.7353469110419004
resetting env. episode reward total was 5.0. running mean: 3.7479934419314813
resetting env. episode reward total was -6.0. running mean: 3.6505135075121666
resetting env. episode reward total was 6.0. running mean: 3.674008372437045
resetting env. episode reward total was 5.0. running mean: 3.687268288712674
resetting env. episode reward total was 14.0. running mean: 3.7903956058255477
resetting env. episode reward total was -11.0. running mean: 3.642491649767292
resetting env. episode reward total was 10.0. running mean: 3.706066733269619
resetting env. episode reward total was 10.0. running mean: 3.769006065936923
resetting env. episode reward total was 2.0. running mean: 3.

resetting env. episode reward total was 4.0. running mean: 4.0700786772313435
resetting env. episode reward total was 6.0. running mean: 4.08937789045903
resetting env. episode reward total was 11.0. running mean: 4.15848411155444
resetting env. episode reward total was 4.0. running mean: 4.156899270438895
resetting env. episode reward total was 7.0. running mean: 4.185330277734507
resetting env. episode reward total was 7.0. running mean: 4.213476974957162
resetting env. episode reward total was 5.0. running mean: 4.22134220520759
resetting env. episode reward total was 9.0. running mean: 4.269128783155514
resetting env. episode reward total was 12.0. running mean: 4.346437495323959
resetting env. episode reward total was -5.0. running mean: 4.2529731203707195
resetting env. episode reward total was -4.0. running mean: 4.170443389167012
resetting env. episode reward total was 7.0. running mean: 4.198738955275342
resetting env. episode reward total was 10.0. running mean: 4.25675156572

resetting env. episode reward total was 9.0. running mean: 3.863008098417561
resetting env. episode reward total was 15.0. running mean: 3.974378017433385
resetting env. episode reward total was 4.0. running mean: 3.9746342372590515
resetting env. episode reward total was 7.0. running mean: 4.004887894886461
resetting env. episode reward total was 4.0. running mean: 4.004839015937597
resetting env. episode reward total was -2.0. running mean: 3.9447906257782206
resetting env. episode reward total was 8.0. running mean: 3.9853427195204385
resetting env. episode reward total was -3.0. running mean: 3.9154892923252342
resetting env. episode reward total was 10.0. running mean: 3.9763343994019817
resetting env. episode reward total was 3.0. running mean: 3.966571055407962
resetting env. episode reward total was 10.0. running mean: 4.026905344853882
resetting env. episode reward total was 7.0. running mean: 4.056636291405343
resetting env. episode reward total was -2.0. running mean: 3.9960

resetting env. episode reward total was -3.0. running mean: 4.578306588697164
resetting env. episode reward total was 12.0. running mean: 4.652523522810193
resetting env. episode reward total was 4.0. running mean: 4.645998287582091
resetting env. episode reward total was 6.0. running mean: 4.6595383047062695
resetting env. episode reward total was 9.0. running mean: 4.702942921659207
resetting env. episode reward total was 6.0. running mean: 4.715913492442614
resetting env. episode reward total was 12.0. running mean: 4.788754357518188
resetting env. episode reward total was 1.0. running mean: 4.750866813943007
resetting env. episode reward total was 13.0. running mean: 4.833358145803577
resetting env. episode reward total was -4.0. running mean: 4.745024564345541
resetting env. episode reward total was 5.0. running mean: 4.747574318702085
resetting env. episode reward total was -11.0. running mean: 4.590098575515064
resetting env. episode reward total was 5.0. running mean: 4.5941975

resetting env. episode reward total was 10.0. running mean: 4.5822791903585705
resetting env. episode reward total was 2.0. running mean: 4.556456398454984
resetting env. episode reward total was 6.0. running mean: 4.570891834470435
resetting env. episode reward total was 5.0. running mean: 4.57518291612573
resetting env. episode reward total was 9.0. running mean: 4.619431086964473
resetting env. episode reward total was 5.0. running mean: 4.623236776094828
resetting env. episode reward total was 5.0. running mean: 4.627004408333879
resetting env. episode reward total was 1.0. running mean: 4.59073436425054
resetting env. episode reward total was 8.0. running mean: 4.624827020608035
resetting env. episode reward total was 11.0. running mean: 4.688578750401955
resetting env. episode reward total was 3.0. running mean: 4.671692962897936
resetting env. episode reward total was -5.0. running mean: 4.574976033268957
resetting env. episode reward total was 8.0. running mean: 4.6092262729362

resetting env. episode reward total was 10.0. running mean: 3.0476077728232203
resetting env. episode reward total was 2.0. running mean: 3.0371316950949883
resetting env. episode reward total was 12.0. running mean: 3.1267603781440383
resetting env. episode reward total was 10.0. running mean: 3.195492774362598
resetting env. episode reward total was 5.0. running mean: 3.2135378466189715
resetting env. episode reward total was 2.0. running mean: 3.2014024681527817
resetting env. episode reward total was 6.0. running mean: 3.229388443471254
resetting env. episode reward total was -6.0. running mean: 3.1370945590365413
resetting env. episode reward total was 10.0. running mean: 3.205723613446176
resetting env. episode reward total was -7.0. running mean: 3.1036663773117144
resetting env. episode reward total was 1.0. running mean: 3.082629713538597
resetting env. episode reward total was 13.0. running mean: 3.181803416403211
resetting env. episode reward total was 12.0. running mean: 3.

resetting env. episode reward total was 4.0. running mean: 4.721303580204623
resetting env. episode reward total was 14.0. running mean: 4.8140905444025766
resetting env. episode reward total was 11.0. running mean: 4.875949638958551
resetting env. episode reward total was 2.0. running mean: 4.847190142568965
resetting env. episode reward total was 14.0. running mean: 4.938718241143275
resetting env. episode reward total was 8.0. running mean: 4.969331058731842
resetting env. episode reward total was 12.0. running mean: 5.039637748144524
resetting env. episode reward total was 2.0. running mean: 5.009241370663078
resetting env. episode reward total was 10.0. running mean: 5.059148956956447
resetting env. episode reward total was 2.0. running mean: 5.028557467386882
resetting env. episode reward total was 5.0. running mean: 5.028271892713013
resetting env. episode reward total was 10.0. running mean: 5.077989173785882
resetting env. episode reward total was 10.0. running mean: 5.1272092

resetting env. episode reward total was 1.0. running mean: 4.423300766164096
resetting env. episode reward total was 2.0. running mean: 4.399067758502454
resetting env. episode reward total was -1.0. running mean: 4.34507708091743
resetting env. episode reward total was -5.0. running mean: 4.251626310108255
resetting env. episode reward total was 14.0. running mean: 4.349110047007173
resetting env. episode reward total was 6.0. running mean: 4.365618946537101
resetting env. episode reward total was 6.0. running mean: 4.381962757071729
resetting env. episode reward total was 4.0. running mean: 4.378143129501012
resetting env. episode reward total was 5.0. running mean: 4.384361698206002
resetting env. episode reward total was 2.0. running mean: 4.360518081223941
resetting env. episode reward total was 8.0. running mean: 4.396912900411702
resetting env. episode reward total was 9.0. running mean: 4.442943771407585
resetting env. episode reward total was 5.0. running mean: 4.4485143336935

resetting env. episode reward total was 11.0. running mean: 4.737999282478931
resetting env. episode reward total was 7.0. running mean: 4.760619289654142
resetting env. episode reward total was -1.0. running mean: 4.7030130967576005
resetting env. episode reward total was 9.0. running mean: 4.7459829657900245
resetting env. episode reward total was -8.0. running mean: 4.618523136132124
resetting env. episode reward total was 3.0. running mean: 4.602337904770803
resetting env. episode reward total was -1.0. running mean: 4.546314525723095
resetting env. episode reward total was 1.0. running mean: 4.510851380465864
resetting env. episode reward total was 9.0. running mean: 4.555742866661205
resetting env. episode reward total was 4.0. running mean: 4.550185437994593
resetting env. episode reward total was -1.0. running mean: 4.4946835836146475
resetting env. episode reward total was 4.0. running mean: 4.489736747778501
resetting env. episode reward total was 4.0. running mean: 4.4848393

resetting env. episode reward total was 2.0. running mean: 4.582858641727615
resetting env. episode reward total was 6.0. running mean: 4.597030055310339
resetting env. episode reward total was 2.0. running mean: 4.5710597547572345
resetting env. episode reward total was -5.0. running mean: 4.475349157209663
resetting env. episode reward total was 7.0. running mean: 4.500595665637566
resetting env. episode reward total was 4.0. running mean: 4.49558970898119
resetting env. episode reward total was 12.0. running mean: 4.570633811891378
resetting env. episode reward total was 1.0. running mean: 4.534927473772464
resetting env. episode reward total was 9.0. running mean: 4.579578199034739
resetting env. episode reward total was -7.0. running mean: 4.463782417044391
resetting env. episode reward total was 9.0. running mean: 4.509144592873947
resetting env. episode reward total was 16.0. running mean: 4.624053146945208
resetting env. episode reward total was 17.0. running mean: 4.7478126154

resetting env. episode reward total was 4.0. running mean: 5.643017190011616
resetting env. episode reward total was 2.0. running mean: 5.606587018111499
resetting env. episode reward total was 1.0. running mean: 5.560521147930384
resetting env. episode reward total was 8.0. running mean: 5.58491593645108
resetting env. episode reward total was 1.0. running mean: 5.5390667770865685
resetting env. episode reward total was 7.0. running mean: 5.553676109315703
resetting env. episode reward total was 3.0. running mean: 5.528139348222546
resetting env. episode reward total was 2.0. running mean: 5.4928579547403205
resetting env. episode reward total was 4.0. running mean: 5.477929375192917
resetting env. episode reward total was 1.0. running mean: 5.433150081440988
resetting env. episode reward total was 3.0. running mean: 5.408818580626578
resetting env. episode reward total was 1.0. running mean: 5.364730394820312
resetting env. episode reward total was 18.0. running mean: 5.4910830908721

resetting env. episode reward total was 5.0. running mean: 5.386739834166132
resetting env. episode reward total was 6.0. running mean: 5.39287243582447
resetting env. episode reward total was 7.0. running mean: 5.408943711466226
resetting env. episode reward total was -8.0. running mean: 5.274854274351564
resetting env. episode reward total was 11.0. running mean: 5.332105731608049
resetting env. episode reward total was 12.0. running mean: 5.398784674291969
resetting env. episode reward total was 2.0. running mean: 5.364796827549049
resetting env. episode reward total was 8.0. running mean: 5.391148859273558
resetting env. episode reward total was 6.0. running mean: 5.3972373706808225
resetting env. episode reward total was 7.0. running mean: 5.413264996974014
resetting env. episode reward total was -1.0. running mean: 5.349132347004274
resetting env. episode reward total was 6.0. running mean: 5.355641023534231
resetting env. episode reward total was 11.0. running mean: 5.4120846132

resetting env. episode reward total was 7.0. running mean: 5.044744015747235
resetting env. episode reward total was 2.0. running mean: 5.014296575589762
resetting env. episode reward total was 15.0. running mean: 5.114153609833865
resetting env. episode reward total was 2.0. running mean: 5.083012073735526
resetting env. episode reward total was -3.0. running mean: 5.00218195299817
resetting env. episode reward total was 13.0. running mean: 5.0821601334681885
resetting env. episode reward total was 1.0. running mean: 5.041338532133507
resetting env. episode reward total was 13.0. running mean: 5.120925146812171
resetting env. episode reward total was 13.0. running mean: 5.199715895344049
resetting env. episode reward total was 9.0. running mean: 5.237718736390608
resetting env. episode reward total was 9.0. running mean: 5.2753415490267015
resetting env. episode reward total was 10.0. running mean: 5.322588133536434
resetting env. episode reward total was -7.0. running mean: 5.1993622

resetting env. episode reward total was 4.0. running mean: 5.52265790488037
resetting env. episode reward total was 2.0. running mean: 5.487431325831566
resetting env. episode reward total was 4.0. running mean: 5.47255701257325
resetting env. episode reward total was 16.0. running mean: 5.5778314424475175
resetting env. episode reward total was 10.0. running mean: 5.622053128023042
resetting env. episode reward total was 5.0. running mean: 5.615832596742812
resetting env. episode reward total was 3.0. running mean: 5.589674270775384
resetting env. episode reward total was 6.0. running mean: 5.59377752806763
resetting env. episode reward total was 13.0. running mean: 5.6678397527869535
resetting env. episode reward total was 3.0. running mean: 5.641161355259084
resetting env. episode reward total was 6.0. running mean: 5.644749741706493
resetting env. episode reward total was 7.0. running mean: 5.658302244289428
resetting env. episode reward total was -2.0. running mean: 5.581719221846

resetting env. episode reward total was 6.0. running mean: 5.363467850712374
resetting env. episode reward total was 6.0. running mean: 5.36983317220525
resetting env. episode reward total was 8.0. running mean: 5.396134840483198
resetting env. episode reward total was 2.0. running mean: 5.362173492078365
resetting env. episode reward total was 3.0. running mean: 5.338551757157582
resetting env. episode reward total was 2.0. running mean: 5.305166239586005
resetting env. episode reward total was 8.0. running mean: 5.332114577190145
resetting env. episode reward total was -1.0. running mean: 5.268793431418244
resetting env. episode reward total was 2.0. running mean: 5.236105497104061
resetting env. episode reward total was 16.0. running mean: 5.343744442133021
resetting env. episode reward total was 7.0. running mean: 5.360306997711691
resetting env. episode reward total was 12.0. running mean: 5.426703927734573
resetting env. episode reward total was 3.0. running mean: 5.4024368884572

resetting env. episode reward total was 8.0. running mean: 5.308979480349545
resetting env. episode reward total was 5.0. running mean: 5.305889685546049
resetting env. episode reward total was 11.0. running mean: 5.362830788690589
resetting env. episode reward total was 2.0. running mean: 5.329202480803683
resetting env. episode reward total was 4.0. running mean: 5.315910455995645
resetting env. episode reward total was 10.0. running mean: 5.362751351435689
resetting env. episode reward total was 4.0. running mean: 5.349123837921332
resetting env. episode reward total was -2.0. running mean: 5.275632599542119
resetting env. episode reward total was 13.0. running mean: 5.3528762735466975
resetting env. episode reward total was 2.0. running mean: 5.31934751081123
resetting env. episode reward total was 2.0. running mean: 5.286154035703118
resetting env. episode reward total was 4.0. running mean: 5.2732924953460865
resetting env. episode reward total was 4.0. running mean: 5.2605595703

resetting env. episode reward total was 6.0. running mean: 4.370396763329991
resetting env. episode reward total was 5.0. running mean: 4.376692795696691
resetting env. episode reward total was 5.0. running mean: 4.382925867739724
resetting env. episode reward total was 1.0. running mean: 4.349096609062326
resetting env. episode reward total was 14.0. running mean: 4.445605642971703
resetting env. episode reward total was 4.0. running mean: 4.441149586541986
resetting env. episode reward total was 9.0. running mean: 4.486738090676566
resetting env. episode reward total was 9.0. running mean: 4.5318707097698
resetting env. episode reward total was 5.0. running mean: 4.536552002672102
resetting env. episode reward total was 1.0. running mean: 4.501186482645381
resetting env. episode reward total was 3.0. running mean: 4.486174617818928
resetting env. episode reward total was 15.0. running mean: 4.5913128716407385
resetting env. episode reward total was 1.0. running mean: 4.55539974292433

resetting env. episode reward total was 11.0. running mean: 5.016872033413388
resetting env. episode reward total was -2.0. running mean: 4.946703313079254
resetting env. episode reward total was 7.0. running mean: 4.967236279948462
resetting env. episode reward total was 9.0. running mean: 5.007563917148977
resetting env. episode reward total was 10.0. running mean: 5.0574882779774875
resetting env. episode reward total was 1.0. running mean: 5.0169133951977125
resetting env. episode reward total was 8.0. running mean: 5.046744261245736
resetting env. episode reward total was 14.0. running mean: 5.136276818633278
resetting env. episode reward total was 15.0. running mean: 5.234914050446946
resetting env. episode reward total was 5.0. running mean: 5.232564909942476
resetting env. episode reward total was -9.0. running mean: 5.090239260843052
resetting env. episode reward total was 10.0. running mean: 5.13933686823462
resetting env. episode reward total was 11.0. running mean: 5.197943

resetting env. episode reward total was 7.0. running mean: 5.084199599627388
resetting env. episode reward total was 8.0. running mean: 5.113357603631114
resetting env. episode reward total was 3.0. running mean: 5.092224027594803
resetting env. episode reward total was -1.0. running mean: 5.031301787318855
resetting env. episode reward total was 7.0. running mean: 5.0509887694456665
resetting env. episode reward total was 2.0. running mean: 5.02047888175121
resetting env. episode reward total was 10.0. running mean: 5.070274092933697
resetting env. episode reward total was -9.0. running mean: 4.92957135200436
resetting env. episode reward total was 6.0. running mean: 4.940275638484316
resetting env. episode reward total was -7.0. running mean: 4.820872882099472
resetting env. episode reward total was -3.0. running mean: 4.7426641532784775
resetting env. episode reward total was 12.0. running mean: 4.815237511745693
resetting env. episode reward total was 4.0. running mean: 4.807085136

resetting env. episode reward total was 4.0. running mean: 5.044786165403395
resetting env. episode reward total was 1.0. running mean: 5.004338303749361
resetting env. episode reward total was -5.0. running mean: 4.904294920711868
resetting env. episode reward total was 3.0. running mean: 4.885251971504749
resetting env. episode reward total was 7.0. running mean: 4.906399451789702
resetting env. episode reward total was 6.0. running mean: 4.917335457271805
resetting env. episode reward total was -1.0. running mean: 4.858162102699087
resetting env. episode reward total was 8.0. running mean: 4.889580481672096
resetting env. episode reward total was 5.0. running mean: 4.890684676855375
resetting env. episode reward total was 8.0. running mean: 4.921777830086821
resetting env. episode reward total was 2.0. running mean: 4.892560051785953
resetting env. episode reward total was 17.0. running mean: 5.013634451268093
resetting env. episode reward total was 7.0. running mean: 5.033498106755

resetting env. episode reward total was 5.0. running mean: 5.6149113249826526
resetting env. episode reward total was 5.0. running mean: 5.6087622117328255
resetting env. episode reward total was -9.0. running mean: 5.462674589615498
resetting env. episode reward total was -6.0. running mean: 5.348047843719343
resetting env. episode reward total was 7.0. running mean: 5.3645673652821495
resetting env. episode reward total was 2.0. running mean: 5.330921691629327
resetting env. episode reward total was 9.0. running mean: 5.367612474713034
resetting env. episode reward total was 1.0. running mean: 5.323936349965903
resetting env. episode reward total was 11.0. running mean: 5.380696986466244
resetting env. episode reward total was 6.0. running mean: 5.386890016601582
resetting env. episode reward total was 6.0. running mean: 5.393021116435565
resetting env. episode reward total was 6.0. running mean: 5.399090905271209
resetting env. episode reward total was 13.0. running mean: 5.47509999

resetting env. episode reward total was 4.0. running mean: 5.445233215663696
resetting env. episode reward total was -9.0. running mean: 5.300780883507059
resetting env. episode reward total was 8.0. running mean: 5.327773074671988
resetting env. episode reward total was 4.0. running mean: 5.314495343925269
resetting env. episode reward total was 1.0. running mean: 5.271350390486016
resetting env. episode reward total was 9.0. running mean: 5.308636886581156
resetting env. episode reward total was -6.0. running mean: 5.195550517715345
resetting env. episode reward total was 8.0. running mean: 5.223595012538191
resetting env. episode reward total was -5.0. running mean: 5.121359062412809
resetting env. episode reward total was -3.0. running mean: 5.040145471788681
resetting env. episode reward total was 9.0. running mean: 5.0797440170707935
resetting env. episode reward total was 8.0. running mean: 5.1089465769000855
resetting env. episode reward total was -1.0. running mean: 5.04785711

resetting env. episode reward total was 9.0. running mean: 5.5962127180740735
resetting env. episode reward total was 4.0. running mean: 5.580250590893333
resetting env. episode reward total was 2.0. running mean: 5.544448084984399
resetting env. episode reward total was 15.0. running mean: 5.6390036041345555
resetting env. episode reward total was 10.0. running mean: 5.68261356809321
resetting env. episode reward total was 8.0. running mean: 5.7057874324122775
resetting env. episode reward total was -2.0. running mean: 5.628729558088155
resetting env. episode reward total was 5.0. running mean: 5.622442262507273
resetting env. episode reward total was 11.0. running mean: 5.6762178398822005
resetting env. episode reward total was 9.0. running mean: 5.7094556614833785
resetting env. episode reward total was 12.0. running mean: 5.772361104868545
resetting env. episode reward total was 2.0. running mean: 5.734637493819859
resetting env. episode reward total was 4.0. running mean: 5.717291

resetting env. episode reward total was 17.0. running mean: 5.899578616324633
resetting env. episode reward total was 11.0. running mean: 5.9505828301613874
resetting env. episode reward total was -2.0. running mean: 5.871077001859774
resetting env. episode reward total was 5.0. running mean: 5.862366231841176
resetting env. episode reward total was 1.0. running mean: 5.8137425695227645
resetting env. episode reward total was 1.0. running mean: 5.765605143827536
resetting env. episode reward total was 2.0. running mean: 5.72794909238926
resetting env. episode reward total was 3.0. running mean: 5.700669601465368
resetting env. episode reward total was 11.0. running mean: 5.753662905450715
resetting env. episode reward total was -1.0. running mean: 5.686126276396208
resetting env. episode reward total was 2.0. running mean: 5.649265013632245
resetting env. episode reward total was 6.0. running mean: 5.652772363495923
resetting env. episode reward total was 9.0. running mean: 5.686244639

resetting env. episode reward total was 8.0. running mean: 6.678016080449679
resetting env. episode reward total was 16.0. running mean: 6.771235919645183
resetting env. episode reward total was -1.0. running mean: 6.693523560448732
resetting env. episode reward total was 8.0. running mean: 6.706588324844245
resetting env. episode reward total was 12.0. running mean: 6.759522441595802
resetting env. episode reward total was 4.0. running mean: 6.7319272171798445
resetting env. episode reward total was 8.0. running mean: 6.744607945008046
resetting env. episode reward total was -4.0. running mean: 6.637161865557966
resetting env. episode reward total was 5.0. running mean: 6.620790246902386
resetting env. episode reward total was 8.0. running mean: 6.634582344433362
resetting env. episode reward total was 11.0. running mean: 6.678236520989029
resetting env. episode reward total was 9.0. running mean: 6.701454155779138
resetting env. episode reward total was 5.0. running mean: 6.684439614

resetting env. episode reward total was 9.0. running mean: 6.176875897566445
resetting env. episode reward total was -4.0. running mean: 6.075107138590781
resetting env. episode reward total was 13.0. running mean: 6.144356067204873
resetting env. episode reward total was 12.0. running mean: 6.202912506532824
resetting env. episode reward total was -6.0. running mean: 6.080883381467496
resetting env. episode reward total was 8.0. running mean: 6.100074547652821
resetting env. episode reward total was 7.0. running mean: 6.109073802176293
resetting env. episode reward total was 11.0. running mean: 6.15798306415453
resetting env. episode reward total was 3.0. running mean: 6.126403233512986
resetting env. episode reward total was 5.0. running mean: 6.115139201177856
resetting env. episode reward total was 5.0. running mean: 6.103987809166077
resetting env. episode reward total was 13.0. running mean: 6.172947931074416
resetting env. episode reward total was 10.0. running mean: 6.211218451

resetting env. episode reward total was 5.0. running mean: 6.082035030449659
resetting env. episode reward total was 2.0. running mean: 6.041214680145162
resetting env. episode reward total was -9.0. running mean: 5.89080253334371
resetting env. episode reward total was 8.0. running mean: 5.911894508010273
resetting env. episode reward total was 5.0. running mean: 5.90277556293017
resetting env. episode reward total was 1.0. running mean: 5.853747807300868
resetting env. episode reward total was 8.0. running mean: 5.875210329227859
resetting env. episode reward total was 11.0. running mean: 5.926458225935581
resetting env. episode reward total was 4.0. running mean: 5.907193643676225
resetting env. episode reward total was 2.0. running mean: 5.868121707239462
resetting env. episode reward total was 14.0. running mean: 5.949440490167067
resetting env. episode reward total was 13.0. running mean: 6.0199460852653965
resetting env. episode reward total was 11.0. running mean: 6.06974662441

resetting env. episode reward total was 5.0. running mean: 6.441007628131128
resetting env. episode reward total was 8.0. running mean: 6.456597551849817
resetting env. episode reward total was 1.0. running mean: 6.402031576331319
resetting env. episode reward total was 5.0. running mean: 6.388011260568005
resetting env. episode reward total was -2.0. running mean: 6.304131147962325
resetting env. episode reward total was 8.0. running mean: 6.3210898364827015
resetting env. episode reward total was 5.0. running mean: 6.307878938117875
resetting env. episode reward total was 15.0. running mean: 6.394800148736696
resetting env. episode reward total was 10.0. running mean: 6.430852147249328
resetting env. episode reward total was 6.0. running mean: 6.4265436257768345
resetting env. episode reward total was 5.0. running mean: 6.412278189519066
resetting env. episode reward total was -5.0. running mean: 6.2981554076238755
resetting env. episode reward total was 1.0. running mean: 6.24517385

resetting env. episode reward total was 7.0. running mean: 6.649347538588553
resetting env. episode reward total was 16.0. running mean: 6.7428540632026674
resetting env. episode reward total was -4.0. running mean: 6.635425522570641
resetting env. episode reward total was 8.0. running mean: 6.649071267344934
resetting env. episode reward total was 8.0. running mean: 6.662580554671485
resetting env. episode reward total was 9.0. running mean: 6.68595474912477
resetting env. episode reward total was -9.0. running mean: 6.529095201633522
resetting env. episode reward total was 4.0. running mean: 6.503804249617187
resetting env. episode reward total was -5.0. running mean: 6.388766207121015
resetting env. episode reward total was 11.0. running mean: 6.434878545049805
resetting env. episode reward total was -1.0. running mean: 6.360529759599307
resetting env. episode reward total was 5.0. running mean: 6.346924462003313
resetting env. episode reward total was 2.0. running mean: 6.303455217

resetting env. episode reward total was 11.0. running mean: 6.419616125100687
resetting env. episode reward total was 1.0. running mean: 6.365419963849679
resetting env. episode reward total was 15.0. running mean: 6.451765764211183
resetting env. episode reward total was 9.0. running mean: 6.477248106569071
resetting env. episode reward total was 9.0. running mean: 6.50247562550338
resetting env. episode reward total was -1.0. running mean: 6.4274508692483465
resetting env. episode reward total was 11.0. running mean: 6.473176360555863
resetting env. episode reward total was -11.0. running mean: 6.298444596950304
resetting env. episode reward total was 13.0. running mean: 6.365460150980801
resetting env. episode reward total was 4.0. running mean: 6.341805549470993
resetting env. episode reward total was 10.0. running mean: 6.378387493976283
resetting env. episode reward total was 13.0. running mean: 6.44460361903652
resetting env. episode reward total was -5.0. running mean: 6.330157

resetting env. episode reward total was 3.0. running mean: 7.012976313960435
resetting env. episode reward total was 2.0. running mean: 6.96284655082083
resetting env. episode reward total was 11.0. running mean: 7.003218085312621
resetting env. episode reward total was 12.0. running mean: 7.053185904459495
resetting env. episode reward total was -7.0. running mean: 6.912654045414899
resetting env. episode reward total was 9.0. running mean: 6.93352750496075
resetting env. episode reward total was 9.0. running mean: 6.9541922299111425
resetting env. episode reward total was 11.0. running mean: 6.994650307612031
resetting env. episode reward total was -1.0. running mean: 6.914703804535911
resetting env. episode reward total was 12.0. running mean: 6.965556766490552
resetting env. episode reward total was 4.0. running mean: 6.935901198825647
resetting env. episode reward total was 11.0. running mean: 6.976542186837391
resetting env. episode reward total was 9.0. running mean: 6.996776764

resetting env. episode reward total was 7.0. running mean: 6.478781682048949
resetting env. episode reward total was 6.0. running mean: 6.47399386522846
resetting env. episode reward total was 7.0. running mean: 6.479253926576176
resetting env. episode reward total was -1.0. running mean: 6.404461387310414
resetting env. episode reward total was 13.0. running mean: 6.47041677343731
resetting env. episode reward total was 7.0. running mean: 6.475712605702937
resetting env. episode reward total was 3.0. running mean: 6.4409554796459085
resetting env. episode reward total was 7.0. running mean: 6.44654592484945
resetting env. episode reward total was 15.0. running mean: 6.532080465600956
resetting env. episode reward total was 3.0. running mean: 6.496759660944947
resetting env. episode reward total was 9.0. running mean: 6.521792064335497
resetting env. episode reward total was 2.0. running mean: 6.476574143692142
resetting env. episode reward total was 5.0. running mean: 6.46180840225522

resetting env. episode reward total was 7.0. running mean: 6.356028828640434
resetting env. episode reward total was 13.0. running mean: 6.422468540354029
resetting env. episode reward total was 13.0. running mean: 6.488243854950489
resetting env. episode reward total was 1.0. running mean: 6.433361416400984
resetting env. episode reward total was 4.0. running mean: 6.409027802236974
resetting env. episode reward total was 6.0. running mean: 6.404937524214604
resetting env. episode reward total was -1.0. running mean: 6.330888148972457
resetting env. episode reward total was -2.0. running mean: 6.247579267482733
resetting env. episode reward total was 1.0. running mean: 6.1951034748079055
resetting env. episode reward total was 7.0. running mean: 6.203152440059827
resetting env. episode reward total was 13.0. running mean: 6.271120915659228
resetting env. episode reward total was -5.0. running mean: 6.158409706502636
resetting env. episode reward total was 3.0. running mean: 6.12682560

resetting env. episode reward total was -2.0. running mean: 5.7446977554540215
resetting env. episode reward total was 8.0. running mean: 5.7672507778994815
resetting env. episode reward total was 5.0. running mean: 5.759578270120486
resetting env. episode reward total was 6.0. running mean: 5.761982487419281
resetting env. episode reward total was 10.0. running mean: 5.8043626625450875
resetting env. episode reward total was 6.0. running mean: 5.806319035919636
resetting env. episode reward total was 6.0. running mean: 5.808255845560439
resetting env. episode reward total was 3.0. running mean: 5.7801732871048355
resetting env. episode reward total was 10.0. running mean: 5.822371554233786
resetting env. episode reward total was -7.0. running mean: 5.694147838691448
resetting env. episode reward total was -3.0. running mean: 5.6072063603045335
resetting env. episode reward total was 2.0. running mean: 5.571134296701488
resetting env. episode reward total was 10.0. running mean: 5.6154

resetting env. episode reward total was 11.0. running mean: 5.7833272564711775
resetting env. episode reward total was 7.0. running mean: 5.795493983906466
resetting env. episode reward total was 4.0. running mean: 5.777539044067401
resetting env. episode reward total was 8.0. running mean: 5.799763653626727
resetting env. episode reward total was 8.0. running mean: 5.82176601709046
resetting env. episode reward total was 1.0. running mean: 5.773548356919555
resetting env. episode reward total was 3.0. running mean: 5.74581287335036
resetting env. episode reward total was 8.0. running mean: 5.768354744616857
resetting env. episode reward total was 8.0. running mean: 5.790671197170688
resetting env. episode reward total was 3.0. running mean: 5.762764485198982
resetting env. episode reward total was 4.0. running mean: 5.7451368403469925
resetting env. episode reward total was 2.0. running mean: 5.707685471943522
resetting env. episode reward total was -4.0. running mean: 5.6106086172240

resetting env. episode reward total was 6.0. running mean: 6.076294770929344
resetting env. episode reward total was -7.0. running mean: 5.94553182322005
resetting env. episode reward total was 14.0. running mean: 6.0260765049878495
resetting env. episode reward total was 5.0. running mean: 6.01581573993797
resetting env. episode reward total was 11.0. running mean: 6.0656575825385906
resetting env. episode reward total was 13.0. running mean: 6.135001006713204
resetting env. episode reward total was 1.0. running mean: 6.083650996646072
resetting env. episode reward total was 1.0. running mean: 6.0328144866796105
resetting env. episode reward total was 9.0. running mean: 6.062486341812814
resetting env. episode reward total was 7.0. running mean: 6.071861478394687
resetting env. episode reward total was 5.0. running mean: 6.06114286361074
resetting env. episode reward total was -1.0. running mean: 5.990531434974632
resetting env. episode reward total was 6.0. running mean: 5.9906261206

resetting env. episode reward total was -6.0. running mean: 6.1496982202234465
resetting env. episode reward total was 13.0. running mean: 6.218201238021212
resetting env. episode reward total was 14.0. running mean: 6.2960192256409995
resetting env. episode reward total was 9.0. running mean: 6.323059033384589
resetting env. episode reward total was 8.0. running mean: 6.3398284430507434
resetting env. episode reward total was 3.0. running mean: 6.306430158620236
resetting env. episode reward total was 7.0. running mean: 6.313365857034034
resetting env. episode reward total was 14.0. running mean: 6.390232198463694
resetting env. episode reward total was 18.0. running mean: 6.5063298764790565
resetting env. episode reward total was 6.0. running mean: 6.501266577714265
resetting env. episode reward total was 13.0. running mean: 6.566253911937122
resetting env. episode reward total was 3.0. running mean: 6.530591372817751
resetting env. episode reward total was 14.0. running mean: 6.6052

resetting env. episode reward total was 11.0. running mean: 6.389488942878468
resetting env. episode reward total was 4.0. running mean: 6.3655940534496835
resetting env. episode reward total was 3.0. running mean: 6.331938112915187
resetting env. episode reward total was 8.0. running mean: 6.3486187317860345
resetting env. episode reward total was -6.0. running mean: 6.225132544468175
resetting env. episode reward total was -5.0. running mean: 6.112881219023493
resetting env. episode reward total was 14.0. running mean: 6.191752406833258
resetting env. episode reward total was -1.0. running mean: 6.119834882764925
resetting env. episode reward total was 13.0. running mean: 6.1886365339372755
resetting env. episode reward total was 15.0. running mean: 6.2767501685979035
resetting env. episode reward total was 7.0. running mean: 6.283982666911925
resetting env. episode reward total was -15.0. running mean: 6.071142840242805
resetting env. episode reward total was 7.0. running mean: 6.08

resetting env. episode reward total was 14.0. running mean: 6.401230349359652
resetting env. episode reward total was 9.0. running mean: 6.427218045866056
resetting env. episode reward total was 3.0. running mean: 6.392945865407396
resetting env. episode reward total was 11.0. running mean: 6.439016406753322
resetting env. episode reward total was 9.0. running mean: 6.464626242685789
resetting env. episode reward total was 5.0. running mean: 6.449979980258931
resetting env. episode reward total was -1.0. running mean: 6.375480180456342
resetting env. episode reward total was 16.0. running mean: 6.4717253786517785
resetting env. episode reward total was 4.0. running mean: 6.447008124865261
resetting env. episode reward total was 10.0. running mean: 6.482538043616608
resetting env. episode reward total was 14.0. running mean: 6.557712663180442
resetting env. episode reward total was 4.0. running mean: 6.532135536548637
resetting env. episode reward total was 6.0. running mean: 6.52681418

resetting env. episode reward total was 14.0. running mean: 6.864557029302862
resetting env. episode reward total was 9.0. running mean: 6.885911459009833
resetting env. episode reward total was 5.0. running mean: 6.867052344419735
resetting env. episode reward total was 3.0. running mean: 6.828381820975538
resetting env. episode reward total was 3.0. running mean: 6.790098002765783
resetting env. episode reward total was 12.0. running mean: 6.842197022738125
resetting env. episode reward total was 3.0. running mean: 6.803775052510744
resetting env. episode reward total was 10.0. running mean: 6.8357373019856364
resetting env. episode reward total was -1.0. running mean: 6.75737992896578
resetting env. episode reward total was 14.0. running mean: 6.829806129676122
resetting env. episode reward total was -1.0. running mean: 6.751508068379361
resetting env. episode reward total was 14.0. running mean: 6.823992987695567
resetting env. episode reward total was 7.0. running mean: 6.82575305

resetting env. episode reward total was 2.0. running mean: 7.863683664109491
resetting env. episode reward total was 4.0. running mean: 7.825046827468396
resetting env. episode reward total was 12.0. running mean: 7.866796359193713
resetting env. episode reward total was 16.0. running mean: 7.948128395601776
resetting env. episode reward total was 9.0. running mean: 7.9586471116457576
resetting env. episode reward total was 8.0. running mean: 7.9590606405293
resetting env. episode reward total was 6.0. running mean: 7.939470034124007
resetting env. episode reward total was 10.0. running mean: 7.9600753337827665
resetting env. episode reward total was 4.0. running mean: 7.920474580444939
resetting env. episode reward total was 1.0. running mean: 7.851269834640489
resetting env. episode reward total was 6.0. running mean: 7.832757136294084
resetting env. episode reward total was 16.0. running mean: 7.914429564931144
resetting env. episode reward total was 14.0. running mean: 7.9752852692

resetting env. episode reward total was 10.0. running mean: 7.891077741384704
resetting env. episode reward total was 8.0. running mean: 7.892166963970857
resetting env. episode reward total was 3.0. running mean: 7.843245294331149
resetting env. episode reward total was 8.0. running mean: 7.844812841387838
resetting env. episode reward total was 9.0. running mean: 7.856364712973959
resetting env. episode reward total was 10.0. running mean: 7.877801065844219
resetting env. episode reward total was 7.0. running mean: 7.869023055185777
resetting env. episode reward total was 12.0. running mean: 7.91033282463392
resetting env. episode reward total was 3.0. running mean: 7.861229496387581
resetting env. episode reward total was 8.0. running mean: 7.862617201423705
resetting env. episode reward total was 15.0. running mean: 7.9339910294094675
resetting env. episode reward total was 8.0. running mean: 7.934651119115373
resetting env. episode reward total was 9.0. running mean: 7.94530460792

resetting env. episode reward total was 12.0. running mean: 7.3672747039996285
resetting env. episode reward total was 3.0. running mean: 7.3236019569596325
resetting env. episode reward total was 3.0. running mean: 7.2803659373900365
resetting env. episode reward total was -3.0. running mean: 7.1775622780161354
resetting env. episode reward total was 2.0. running mean: 7.125786655235974
resetting env. episode reward total was 3.0. running mean: 7.084528788683614
resetting env. episode reward total was 7.0. running mean: 7.083683500796778
resetting env. episode reward total was 1.0. running mean: 7.02284666578881
resetting env. episode reward total was 12.0. running mean: 7.072618199130922
resetting env. episode reward total was -4.0. running mean: 6.961892017139612
resetting env. episode reward total was 7.0. running mean: 6.962273096968216
resetting env. episode reward total was -2.0. running mean: 6.872650365998535
resetting env. episode reward total was 15.0. running mean: 6.953923

resetting env. episode reward total was 13.0. running mean: 6.922828894182203
resetting env. episode reward total was 12.0. running mean: 6.973600605240381
resetting env. episode reward total was 14.0. running mean: 7.043864599187977
resetting env. episode reward total was 7.0. running mean: 7.043425953196097
resetting env. episode reward total was 11.0. running mean: 7.082991693664137
resetting env. episode reward total was 12.0. running mean: 7.132161776727496
resetting env. episode reward total was 16.0. running mean: 7.220840158960221
resetting env. episode reward total was 5.0. running mean: 7.1986317573706184
resetting env. episode reward total was 5.0. running mean: 7.176645439796912
resetting env. episode reward total was -7.0. running mean: 7.034878985398943
resetting env. episode reward total was -2.0. running mean: 6.944530195544954
resetting env. episode reward total was 10.0. running mean: 6.975084893589504
resetting env. episode reward total was 14.0. running mean: 7.0453

resetting env. episode reward total was 3.0. running mean: 7.188807097954717
resetting env. episode reward total was 9.0. running mean: 7.20691902697517
resetting env. episode reward total was 4.0. running mean: 7.174849836705418
resetting env. episode reward total was 12.0. running mean: 7.223101338338363
resetting env. episode reward total was 11.0. running mean: 7.26087032495498
resetting env. episode reward total was 6.0. running mean: 7.24826162170543
resetting env. episode reward total was 12.0. running mean: 7.2957790054883755
resetting env. episode reward total was 9.0. running mean: 7.312821215433492
resetting env. episode reward total was 9.0. running mean: 7.329693003279157
resetting env. episode reward total was 9.0. running mean: 7.346396073246366
resetting env. episode reward total was 5.0. running mean: 7.322932112513902
resetting env. episode reward total was 9.0. running mean: 7.3397027913887625
resetting env. episode reward total was 9.0. running mean: 7.3563057634748

resetting env. episode reward total was 10.0. running mean: 6.7523093990636065
resetting env. episode reward total was 9.0. running mean: 6.77478630507297
resetting env. episode reward total was 6.0. running mean: 6.7670384420222405
resetting env. episode reward total was 13.0. running mean: 6.829368057602018
resetting env. episode reward total was 15.0. running mean: 6.911074377025998
resetting env. episode reward total was 11.0. running mean: 6.951963633255739
resetting env. episode reward total was 15.0. running mean: 7.032443996923181
resetting env. episode reward total was 8.0. running mean: 7.042119556953949
resetting env. episode reward total was 7.0. running mean: 7.04169836138441
resetting env. episode reward total was 6.0. running mean: 7.031281377770565
resetting env. episode reward total was 12.0. running mean: 7.080968563992859
resetting env. episode reward total was 9.0. running mean: 7.10015887835293
resetting env. episode reward total was 12.0. running mean: 7.149157289

resetting env. episode reward total was 11.0. running mean: 7.14833350834457
resetting env. episode reward total was 12.0. running mean: 7.196850173261124
resetting env. episode reward total was 7.0. running mean: 7.194881671528513
resetting env. episode reward total was 15.0. running mean: 7.272932854813228
resetting env. episode reward total was 4.0. running mean: 7.240203526265096
resetting env. episode reward total was 10.0. running mean: 7.267801491002444
resetting env. episode reward total was 2.0. running mean: 7.21512347609242
resetting env. episode reward total was 8.0. running mean: 7.222972241331496
resetting env. episode reward total was 5.0. running mean: 7.20074251891818
resetting env. episode reward total was 4.0. running mean: 7.168735093728999
resetting env. episode reward total was 8.0. running mean: 7.177047742791709
resetting env. episode reward total was 13.0. running mean: 7.235277265363791
resetting env. episode reward total was 10.0. running mean: 7.262924492710

resetting env. episode reward total was 7.0. running mean: 8.11306037441522
resetting env. episode reward total was -3.0. running mean: 8.001929770671069
resetting env. episode reward total was 3.0. running mean: 7.951910472964358
resetting env. episode reward total was 4.0. running mean: 7.912391368234714
resetting env. episode reward total was 12.0. running mean: 7.953267454552367
resetting env. episode reward total was 11.0. running mean: 7.983734780006843
resetting env. episode reward total was 5.0. running mean: 7.953897432206775
resetting env. episode reward total was 5.0. running mean: 7.924358457884707
resetting env. episode reward total was -3.0. running mean: 7.815114873305859
resetting env. episode reward total was 1.0. running mean: 7.7469637245728
resetting env. episode reward total was 11.0. running mean: 7.779494087327072
resetting env. episode reward total was 4.0. running mean: 7.741699146453802
resetting env. episode reward total was 2.0. running mean: 7.6842821549892

resetting env. episode reward total was 13.0. running mean: 8.074377288696537
resetting env. episode reward total was 6.0. running mean: 8.053633515809572
resetting env. episode reward total was 12.0. running mean: 8.093097180651476
resetting env. episode reward total was 9.0. running mean: 8.102166208844961
resetting env. episode reward total was 10.0. running mean: 8.121144546756511
resetting env. episode reward total was 15.0. running mean: 8.189933101288947
resetting env. episode reward total was 10.0. running mean: 8.208033770276057
resetting env. episode reward total was -1.0. running mean: 8.115953432573296
resetting env. episode reward total was -6.0. running mean: 7.974793898247563
resetting env. episode reward total was 18.0. running mean: 8.075045959265086
resetting env. episode reward total was 7.0. running mean: 8.064295499672435
resetting env. episode reward total was 5.0. running mean: 8.03365254467571
resetting env. episode reward total was 5.0. running mean: 8.00331601

resetting env. episode reward total was 5.0. running mean: 8.15000972364655
resetting env. episode reward total was 11.0. running mean: 8.178509626410083
resetting env. episode reward total was 12.0. running mean: 8.216724530145981
resetting env. episode reward total was 10.0. running mean: 8.234557284844522
resetting env. episode reward total was -5.0. running mean: 8.102211711996075
resetting env. episode reward total was 9.0. running mean: 8.111189594876114
resetting env. episode reward total was -1.0. running mean: 8.020077698927354
resetting env. episode reward total was 5.0. running mean: 7.98987692193808
resetting env. episode reward total was 2.0. running mean: 7.929978152718699
resetting env. episode reward total was 2.0. running mean: 7.870678371191512
resetting env. episode reward total was 13.0. running mean: 7.921971587479597
resetting env. episode reward total was 13.0. running mean: 7.9727518716048005
resetting env. episode reward total was 2.0. running mean: 7.913024352

resetting env. episode reward total was 2.0. running mean: 7.348191268611261
resetting env. episode reward total was 14.0. running mean: 7.414709355925148
resetting env. episode reward total was 5.0. running mean: 7.390562262365896
resetting env. episode reward total was 12.0. running mean: 7.436656639742237
resetting env. episode reward total was 9.0. running mean: 7.452290073344814
resetting env. episode reward total was 10.0. running mean: 7.477767172611366
resetting env. episode reward total was 8.0. running mean: 7.482989500885252
resetting env. episode reward total was 15.0. running mean: 7.5581596058764005
resetting env. episode reward total was 14.0. running mean: 7.622578009817636
resetting env. episode reward total was 7.0. running mean: 7.61635222971946
resetting env. episode reward total was 10.0. running mean: 7.640188707422265
resetting env. episode reward total was 4.0. running mean: 7.603786820348042
resetting env. episode reward total was 7.0. running mean: 7.597748952

resetting env. episode reward total was 11.0. running mean: 8.086096098281121
resetting env. episode reward total was 4.0. running mean: 8.04523513729831
resetting env. episode reward total was 4.0. running mean: 8.004782785925325
resetting env. episode reward total was 14.0. running mean: 8.064734958066072
resetting env. episode reward total was 15.0. running mean: 8.13408760848541
resetting env. episode reward total was 3.0. running mean: 8.082746732400556
resetting env. episode reward total was -1.0. running mean: 7.99191926507655
resetting env. episode reward total was 15.0. running mean: 8.062000072425784
resetting env. episode reward total was 6.0. running mean: 8.041380071701527
resetting env. episode reward total was 3.0. running mean: 7.9909662709845115
resetting env. episode reward total was 13.0. running mean: 8.041056608274667
resetting env. episode reward total was 7.0. running mean: 8.03064604219192
resetting env. episode reward total was 11.0. running mean: 8.06033958177

resetting env. episode reward total was 8.0. running mean: 8.065270101476619
resetting env. episode reward total was 6.0. running mean: 8.044617400461853
resetting env. episode reward total was -3.0. running mean: 7.9341712264572335
resetting env. episode reward total was 10.0. running mean: 7.954829514192661
resetting env. episode reward total was 6.0. running mean: 7.935281219050734
resetting env. episode reward total was 13.0. running mean: 7.985928406860226
resetting env. episode reward total was 4.0. running mean: 7.946069122791624
resetting env. episode reward total was 16.0. running mean: 8.026608431563707
resetting env. episode reward total was -1.0. running mean: 7.93634234724807
resetting env. episode reward total was 4.0. running mean: 7.896978923775589
resetting env. episode reward total was 10.0. running mean: 7.918009134537833
resetting env. episode reward total was 1.0. running mean: 7.848829043192454
resetting env. episode reward total was 12.0. running mean: 7.89034075

resetting env. episode reward total was 2.0. running mean: 8.06095644979225
resetting env. episode reward total was 19.0. running mean: 8.170346885294327
resetting env. episode reward total was 13.0. running mean: 8.218643416441385
resetting env. episode reward total was 6.0. running mean: 8.19645698227697
resetting env. episode reward total was 13.0. running mean: 8.244492412454202
resetting env. episode reward total was 8.0. running mean: 8.242047488329659
resetting env. episode reward total was 10.0. running mean: 8.259627013446362
resetting env. episode reward total was 10.0. running mean: 8.277030743311899
resetting env. episode reward total was 14.0. running mean: 8.33426043587878
resetting env. episode reward total was 5.0. running mean: 8.300917831519994
resetting env. episode reward total was 4.0. running mean: 8.257908653204794
resetting env. episode reward total was 8.0. running mean: 8.255329566672746
resetting env. episode reward total was 6.0. running mean: 8.232776271006

resetting env. episode reward total was 14.0. running mean: 7.787796918075743
resetting env. episode reward total was -3.0. running mean: 7.679918948894985
resetting env. episode reward total was 3.0. running mean: 7.633119759406035
resetting env. episode reward total was 12.0. running mean: 7.676788561811974
resetting env. episode reward total was 5.0. running mean: 7.650020676193854
resetting env. episode reward total was 2.0. running mean: 7.593520469431915
resetting env. episode reward total was 12.0. running mean: 7.637585264737596
resetting env. episode reward total was 7.0. running mean: 7.63120941209022
resetting env. episode reward total was 5.0. running mean: 7.604897317969318
resetting env. episode reward total was 2.0. running mean: 7.548848344789624
resetting env. episode reward total was 14.0. running mean: 7.613359861341728
resetting env. episode reward total was 10.0. running mean: 7.63722626272831
resetting env. episode reward total was 12.0. running mean: 7.6808540001

resetting env. episode reward total was 6.0. running mean: 7.970525489828055
resetting env. episode reward total was 10.0. running mean: 7.990820234929774
resetting env. episode reward total was 13.0. running mean: 8.040912032580476
resetting env. episode reward total was 12.0. running mean: 8.080502912254671
resetting env. episode reward total was 13.0. running mean: 8.129697883132124
resetting env. episode reward total was 11.0. running mean: 8.158400904300802
resetting env. episode reward total was 8.0. running mean: 8.156816895257794
resetting env. episode reward total was 11.0. running mean: 8.185248726305215
resetting env. episode reward total was 5.0. running mean: 8.153396239042163
resetting env. episode reward total was 11.0. running mean: 8.18186227665174
resetting env. episode reward total was 7.0. running mean: 8.170043653885223
resetting env. episode reward total was 9.0. running mean: 8.17834321734637
resetting env. episode reward total was 5.0. running mean: 8.1465597851

resetting env. episode reward total was 14.0. running mean: 7.609871635183119
resetting env. episode reward total was 9.0. running mean: 7.623772918831287
resetting env. episode reward total was 7.0. running mean: 7.617535189642974
resetting env. episode reward total was 4.0. running mean: 7.581359837746545
resetting env. episode reward total was 2.0. running mean: 7.525546239369079
resetting env. episode reward total was 2.0. running mean: 7.4702907769753875
resetting env. episode reward total was 9.0. running mean: 7.4855878692056335
resetting env. episode reward total was 9.0. running mean: 7.500731990513577
resetting env. episode reward total was 10.0. running mean: 7.525724670608441
resetting env. episode reward total was 12.0. running mean: 7.570467423902357
resetting env. episode reward total was 8.0. running mean: 7.574762749663333
resetting env. episode reward total was 15.0. running mean: 7.6490151221667
resetting env. episode reward total was 15.0. running mean: 7.7225249709

resetting env. episode reward total was -5.0. running mean: 7.910589856241831
resetting env. episode reward total was -9.0. running mean: 7.741483957679413
resetting env. episode reward total was 12.0. running mean: 7.784069118102619
resetting env. episode reward total was 4.0. running mean: 7.746228426921593
resetting env. episode reward total was 8.0. running mean: 7.748766142652377
resetting env. episode reward total was 6.0. running mean: 7.731278481225853
resetting env. episode reward total was 5.0. running mean: 7.703965696413594
resetting env. episode reward total was 12.0. running mean: 7.746926039449458
resetting env. episode reward total was 1.0. running mean: 7.679456779054963
resetting env. episode reward total was 3.0. running mean: 7.632662211264414
resetting env. episode reward total was 1.0. running mean: 7.56633558915177
resetting env. episode reward total was 2.0. running mean: 7.510672233260252
resetting env. episode reward total was 11.0. running mean: 7.54556551092

resetting env. episode reward total was 9.0. running mean: 7.381548785494562
resetting env. episode reward total was 14.0. running mean: 7.4477332976396164
resetting env. episode reward total was 8.0. running mean: 7.45325596466322
resetting env. episode reward total was 10.0. running mean: 7.478723405016587
resetting env. episode reward total was 9.0. running mean: 7.493936170966421
resetting env. episode reward total was 6.0. running mean: 7.478996809256756
resetting env. episode reward total was 2.0. running mean: 7.424206841164188
resetting env. episode reward total was 5.0. running mean: 7.399964772752546
resetting env. episode reward total was -3.0. running mean: 7.29596512502502
resetting env. episode reward total was 7.0. running mean: 7.2930054737747705
resetting env. episode reward total was 11.0. running mean: 7.330075419037023
resetting env. episode reward total was 6.0. running mean: 7.316774664846652
resetting env. episode reward total was 4.0. running mean: 7.28360691819

resetting env. episode reward total was -9.0. running mean: 7.200733218659854
resetting env. episode reward total was 14.0. running mean: 7.268725886473256
resetting env. episode reward total was 8.0. running mean: 7.276038627608523
resetting env. episode reward total was 11.0. running mean: 7.313278241332438
resetting env. episode reward total was 11.0. running mean: 7.350145458919114
resetting env. episode reward total was 4.0. running mean: 7.316644004329922
resetting env. episode reward total was 8.0. running mean: 7.323477564286623
resetting env. episode reward total was 8.0. running mean: 7.330242788643757
resetting env. episode reward total was 9.0. running mean: 7.346940360757319
resetting env. episode reward total was 4.0. running mean: 7.313470957149746
resetting env. episode reward total was 6.0. running mean: 7.300336247578248
resetting env. episode reward total was 2.0. running mean: 7.247332885102465
resetting env. episode reward total was 1.0. running mean: 7.18485955625

resetting env. episode reward total was 9.0. running mean: 7.326345855024457
resetting env. episode reward total was 13.0. running mean: 7.3830823964742125
resetting env. episode reward total was 6.0. running mean: 7.36925157250947
resetting env. episode reward total was 9.0. running mean: 7.385559056784375
resetting env. episode reward total was 12.0. running mean: 7.431703466216532
resetting env. episode reward total was 14.0. running mean: 7.497386431554366
resetting env. episode reward total was 15.0. running mean: 7.572412567238823
resetting env. episode reward total was 4.0. running mean: 7.536688441566435
resetting env. episode reward total was 1.0. running mean: 7.4713215571507705
resetting env. episode reward total was 3.0. running mean: 7.426608341579263
resetting env. episode reward total was 15.0. running mean: 7.502342258163471
resetting env. episode reward total was 13.0. running mean: 7.557318835581836
resetting env. episode reward total was 8.0. running mean: 7.56174564

resetting env. episode reward total was 13.0. running mean: 7.275297030891126
resetting env. episode reward total was 2.0. running mean: 7.222544060582215
resetting env. episode reward total was 1.0. running mean: 7.160318619976392
resetting env. episode reward total was 5.0. running mean: 7.138715433776628
resetting env. episode reward total was 5.0. running mean: 7.117328279438862
resetting env. episode reward total was 1.0. running mean: 7.056154996644473
resetting env. episode reward total was 9.0. running mean: 7.075593446678028
resetting env. episode reward total was 10.0. running mean: 7.1048375122112475
resetting env. episode reward total was 5.0. running mean: 7.083789137089135
resetting env. episode reward total was 6.0. running mean: 7.072951245718243
resetting env. episode reward total was 10.0. running mean: 7.10222173326106
resetting env. episode reward total was 8.0. running mean: 7.111199515928449
resetting env. episode reward total was -7.0. running mean: 6.97008752076

resetting env. episode reward total was 11.0. running mean: 7.33510919665376
resetting env. episode reward total was 9.0. running mean: 7.351758104687222
resetting env. episode reward total was 10.0. running mean: 7.378240523640349
resetting env. episode reward total was 11.0. running mean: 7.414458118403946
resetting env. episode reward total was 8.0. running mean: 7.420313537219907
resetting env. episode reward total was 6.0. running mean: 7.406110401847707
resetting env. episode reward total was 2.0. running mean: 7.35204929782923
resetting env. episode reward total was 17.0. running mean: 7.4485288048509375
resetting env. episode reward total was 11.0. running mean: 7.484043516802428
resetting env. episode reward total was 8.0. running mean: 7.489203081634404
resetting env. episode reward total was 5.0. running mean: 7.4643110508180595
resetting env. episode reward total was -7.0. running mean: 7.319667940309879
resetting env. episode reward total was 3.0. running mean: 7.276471260

resetting env. episode reward total was 10.0. running mean: 6.73695947908941
resetting env. episode reward total was 1.0. running mean: 6.679589884298515
resetting env. episode reward total was 3.0. running mean: 6.642793985455531
resetting env. episode reward total was 8.0. running mean: 6.656366045600976
resetting env. episode reward total was 6.0. running mean: 6.649802385144965
resetting env. episode reward total was 15.0. running mean: 6.7333043612935155
resetting env. episode reward total was 11.0. running mean: 6.775971317680581
resetting env. episode reward total was 10.0. running mean: 6.808211604503774
resetting env. episode reward total was 12.0. running mean: 6.860129488458736
resetting env. episode reward total was 13.0. running mean: 6.921528193574149
resetting env. episode reward total was 10.0. running mean: 6.9523129116384075
resetting env. episode reward total was 15.0. running mean: 7.032789782522023
resetting env. episode reward total was 9.0. running mean: 7.052461

resetting env. episode reward total was 6.0. running mean: 7.226257628505598
resetting env. episode reward total was 9.0. running mean: 7.243995052220542
resetting env. episode reward total was 7.0. running mean: 7.241555101698337
resetting env. episode reward total was 10.0. running mean: 7.269139550681353
resetting env. episode reward total was 15.0. running mean: 7.3464481551745395
resetting env. episode reward total was 4.0. running mean: 7.312983673622794
resetting env. episode reward total was 4.0. running mean: 7.279853836886566
resetting env. episode reward total was 5.0. running mean: 7.257055298517701
resetting env. episode reward total was 6.0. running mean: 7.244484745532523
resetting env. episode reward total was 7.0. running mean: 7.242039898077198
resetting env. episode reward total was 6.0. running mean: 7.229619499096425
resetting env. episode reward total was 16.0. running mean: 7.317323304105461
resetting env. episode reward total was 1.0. running mean: 7.25415007106

resetting env. episode reward total was 9.0. running mean: 7.839653894088923
resetting env. episode reward total was 8.0. running mean: 7.841257355148034
resetting env. episode reward total was 4.0. running mean: 7.802844781596554
resetting env. episode reward total was 6.0. running mean: 7.784816333780588
resetting env. episode reward total was -5.0. running mean: 7.6569681704427826
resetting env. episode reward total was 4.0. running mean: 7.6203984887383545
resetting env. episode reward total was 2.0. running mean: 7.56419450385097
resetting env. episode reward total was 8.0. running mean: 7.568552558812461
resetting env. episode reward total was 7.0. running mean: 7.562867033224336
resetting env. episode reward total was 8.0. running mean: 7.567238362892093
resetting env. episode reward total was 16.0. running mean: 7.6515659792631725
resetting env. episode reward total was -5.0. running mean: 7.525050319470541
resetting env. episode reward total was 9.0. running mean: 7.5397998162

resetting env. episode reward total was 5.0. running mean: 7.220286070190436
resetting env. episode reward total was 11.0. running mean: 7.258083209488532
resetting env. episode reward total was -2.0. running mean: 7.165502377393647
resetting env. episode reward total was 14.0. running mean: 7.23384735361971
resetting env. episode reward total was 3.0. running mean: 7.191508880083513
resetting env. episode reward total was 11.0. running mean: 7.229593791282679
resetting env. episode reward total was 14.0. running mean: 7.297297853369852
resetting env. episode reward total was 13.0. running mean: 7.354324874836153
resetting env. episode reward total was 8.0. running mean: 7.360781626087792
resetting env. episode reward total was 9.0. running mean: 7.377173809826913
resetting env. episode reward total was 13.0. running mean: 7.4334020717286435
resetting env. episode reward total was 6.0. running mean: 7.419068051011356
resetting env. episode reward total was 15.0. running mean: 7.4948773

resetting env. episode reward total was 2.0. running mean: 7.875182288742912
resetting env. episode reward total was 8.0. running mean: 7.876430465855483
resetting env. episode reward total was 9.0. running mean: 7.887666161196928
resetting env. episode reward total was -3.0. running mean: 7.778789499584959
resetting env. episode reward total was 11.0. running mean: 7.8110016045891095
resetting env. episode reward total was 11.0. running mean: 7.842891588543218
resetting env. episode reward total was 4.0. running mean: 7.804462672657786
resetting env. episode reward total was 16.0. running mean: 7.886418045931209
resetting env. episode reward total was 8.0. running mean: 7.887553865471897
resetting env. episode reward total was 9.0. running mean: 7.898678326817178
resetting env. episode reward total was -3.0. running mean: 7.789691543549006
resetting env. episode reward total was 5.0. running mean: 7.761794628113516
resetting env. episode reward total was 10.0. running mean: 7.78417668

resetting env. episode reward total was 5.0. running mean: 8.440426494637094
resetting env. episode reward total was 10.0. running mean: 8.456022229690722
resetting env. episode reward total was 7.0. running mean: 8.441462007393815
resetting env. episode reward total was 13.0. running mean: 8.487047387319878
resetting env. episode reward total was 8.0. running mean: 8.48217691344668
resetting env. episode reward total was 11.0. running mean: 8.507355144312212
resetting env. episode reward total was 10.0. running mean: 8.52228159286909
resetting env. episode reward total was 9.0. running mean: 8.527058776940398
resetting env. episode reward total was 17.0. running mean: 8.611788189170994
resetting env. episode reward total was 3.0. running mean: 8.555670307279284
resetting env. episode reward total was 8.0. running mean: 8.55011360420649
resetting env. episode reward total was 1.0. running mean: 8.474612468164425
resetting env. episode reward total was 18.0. running mean: 8.569866343482

resetting env. episode reward total was 13.0. running mean: 8.055091269995343
resetting env. episode reward total was 7.0. running mean: 8.04454035729539
resetting env. episode reward total was 6.0. running mean: 8.024094953722436
resetting env. episode reward total was 11.0. running mean: 8.053854004185212
resetting env. episode reward total was 14.0. running mean: 8.11331546414336
resetting env. episode reward total was 5.0. running mean: 8.082182309501928
resetting env. episode reward total was 14.0. running mean: 8.141360486406908
resetting env. episode reward total was 2.0. running mean: 8.079946881542838
resetting env. episode reward total was 5.0. running mean: 8.049147412727411
resetting env. episode reward total was 12.0. running mean: 8.088655938600137
resetting env. episode reward total was 14.0. running mean: 8.147769379214136
resetting env. episode reward total was -3.0. running mean: 8.036291685421995
resetting env. episode reward total was 16.0. running mean: 8.115928768

resetting env. episode reward total was 17.0. running mean: 8.549910130101729
resetting env. episode reward total was 9.0. running mean: 8.554411028800711
resetting env. episode reward total was 13.0. running mean: 8.598866918512705
resetting env. episode reward total was 18.0. running mean: 8.692878249327578
resetting env. episode reward total was 8.0. running mean: 8.685949466834302
resetting env. episode reward total was 1.0. running mean: 8.60908997216596
resetting env. episode reward total was 7.0. running mean: 8.5929990724443
resetting env. episode reward total was 8.0. running mean: 8.587069081719857
resetting env. episode reward total was 4.0. running mean: 8.541198390902657
resetting env. episode reward total was 2.0. running mean: 8.475786406993631
resetting env. episode reward total was 5.0. running mean: 8.441028542923695
resetting env. episode reward total was 9.0. running mean: 8.446618257494457
resetting env. episode reward total was 13.0. running mean: 8.49215207491951

resetting env. episode reward total was 10.0. running mean: 8.13561439796105
resetting env. episode reward total was 2.0. running mean: 8.074258253981439
resetting env. episode reward total was 10.0. running mean: 8.093515671441624
resetting env. episode reward total was 15.0. running mean: 8.162580514727209
resetting env. episode reward total was 7.0. running mean: 8.150954709579937
resetting env. episode reward total was 11.0. running mean: 8.179445162484136
resetting env. episode reward total was 5.0. running mean: 8.147650710859295
resetting env. episode reward total was 6.0. running mean: 8.126174203750702
resetting env. episode reward total was 11.0. running mean: 8.154912461713195
resetting env. episode reward total was -5.0. running mean: 8.023363337096063
resetting env. episode reward total was 13.0. running mean: 8.073129703725103
resetting env. episode reward total was 9.0. running mean: 8.082398406687853
resetting env. episode reward total was 3.0. running mean: 8.031574422

resetting env. episode reward total was 4.0. running mean: 8.626951070868618
resetting env. episode reward total was 11.0. running mean: 8.650681560159931
resetting env. episode reward total was 9.0. running mean: 8.654174744558333
resetting env. episode reward total was 6.0. running mean: 8.62763299711275
resetting env. episode reward total was 10.0. running mean: 8.641356667141622
resetting env. episode reward total was 8.0. running mean: 8.634943100470206
resetting env. episode reward total was -9.0. running mean: 8.458593669465504
resetting env. episode reward total was 15.0. running mean: 8.52400773277085
resetting env. episode reward total was 5.0. running mean: 8.48876765544314
resetting env. episode reward total was 1.0. running mean: 8.413879978888708
resetting env. episode reward total was 14.0. running mean: 8.469741179099822
resetting env. episode reward total was 9.0. running mean: 8.475043767308824
resetting env. episode reward total was 7.0. running mean: 8.4602933296357

resetting env. episode reward total was 4.0. running mean: 8.447621469795845
resetting env. episode reward total was 14.0. running mean: 8.503145255097886
resetting env. episode reward total was 10.0. running mean: 8.518113802546907
resetting env. episode reward total was 14.0. running mean: 8.572932664521439
resetting env. episode reward total was 16.0. running mean: 8.647203337876224
resetting env. episode reward total was 6.0. running mean: 8.620731304497463
resetting env. episode reward total was 8.0. running mean: 8.614523991452488
resetting env. episode reward total was 6.0. running mean: 8.588378751537963
resetting env. episode reward total was 8.0. running mean: 8.582494964022583
resetting env. episode reward total was 6.0. running mean: 8.556670014382357
resetting env. episode reward total was 5.0. running mean: 8.521103314238534
resetting env. episode reward total was 6.0. running mean: 8.495892281096149
resetting env. episode reward total was 11.0. running mean: 8.5209333582

resetting env. episode reward total was 11.0. running mean: 8.528633075741128
resetting env. episode reward total was 11.0. running mean: 8.553346744983717
resetting env. episode reward total was 5.0. running mean: 8.51781327753388
resetting env. episode reward total was 14.0. running mean: 8.572635144758543
resetting env. episode reward total was -7.0. running mean: 8.416908793310958
resetting env. episode reward total was 8.0. running mean: 8.412739705377849
resetting env. episode reward total was 16.0. running mean: 8.48861230832407
resetting env. episode reward total was 8.0. running mean: 8.48372618524083
resetting env. episode reward total was 13.0. running mean: 8.528888923388422
resetting env. episode reward total was 13.0. running mean: 8.573600034154538
resetting env. episode reward total was 6.0. running mean: 8.547864033812992
resetting env. episode reward total was 9.0. running mean: 8.552385393474863
resetting env. episode reward total was 2.0. running mean: 8.48686153954

resetting env. episode reward total was 13.0. running mean: 8.509908034632605
resetting env. episode reward total was 5.0. running mean: 8.47480895428628
resetting env. episode reward total was 9.0. running mean: 8.480060864743416
resetting env. episode reward total was 6.0. running mean: 8.455260256095983
resetting env. episode reward total was 4.0. running mean: 8.410707653535022
resetting env. episode reward total was 7.0. running mean: 8.396600576999672
resetting env. episode reward total was 7.0. running mean: 8.382634571229676
resetting env. episode reward total was 11.0. running mean: 8.40880822551738
resetting env. episode reward total was 12.0. running mean: 8.444720143262204
resetting env. episode reward total was 9.0. running mean: 8.450272941829581
resetting env. episode reward total was 6.0. running mean: 8.425770212411287
resetting env. episode reward total was 14.0. running mean: 8.481512510287175
resetting env. episode reward total was 13.0. running mean: 8.526697385184

resetting env. episode reward total was 14.0. running mean: 8.522391757555289
resetting env. episode reward total was 17.0. running mean: 8.607167839979736
resetting env. episode reward total was 7.0. running mean: 8.591096161579939
resetting env. episode reward total was 6.0. running mean: 8.56518519996414
resetting env. episode reward total was 7.0. running mean: 8.5495333479645
resetting env. episode reward total was -3.0. running mean: 8.434038014484855
resetting env. episode reward total was -3.0. running mean: 8.319697634340008
resetting env. episode reward total was 10.0. running mean: 8.336500657996607
resetting env. episode reward total was 10.0. running mean: 8.353135651416641
resetting env. episode reward total was 8.0. running mean: 8.349604294902475
resetting env. episode reward total was 15.0. running mean: 8.416108251953451
resetting env. episode reward total was 5.0. running mean: 8.381947169433918
resetting env. episode reward total was 12.0. running mean: 8.4181276977

resetting env. episode reward total was 11.0. running mean: 8.538500197993613
resetting env. episode reward total was -1.0. running mean: 8.443115196013677
resetting env. episode reward total was 11.0. running mean: 8.46868404405354
resetting env. episode reward total was 14.0. running mean: 8.523997203613003
resetting env. episode reward total was 11.0. running mean: 8.548757231576873
resetting env. episode reward total was 5.0. running mean: 8.513269659261105
resetting env. episode reward total was 14.0. running mean: 8.568136962668495
resetting env. episode reward total was 11.0. running mean: 8.592455593041809
resetting env. episode reward total was -7.0. running mean: 8.43653103711139
resetting env. episode reward total was 13.0. running mean: 8.482165726740277
resetting env. episode reward total was 8.0. running mean: 8.477344069472874
resetting env. episode reward total was 10.0. running mean: 8.492570628778145
resetting env. episode reward total was 1.0. running mean: 8.4176449

resetting env. episode reward total was 13.0. running mean: 8.979603987637022
resetting env. episode reward total was 9.0. running mean: 8.979807947760651
resetting env. episode reward total was 13.0. running mean: 9.020009868283045
resetting env. episode reward total was 12.0. running mean: 9.049809769600214
resetting env. episode reward total was 5.0. running mean: 9.009311671904213
resetting env. episode reward total was 2.0. running mean: 8.93921855518517
resetting env. episode reward total was 5.0. running mean: 8.899826369633319
resetting env. episode reward total was 4.0. running mean: 8.850828105936985
resetting env. episode reward total was 11.0. running mean: 8.872319824877614
resetting env. episode reward total was 8.0. running mean: 8.863596626628839
resetting env. episode reward total was -3.0. running mean: 8.744960660362551
resetting env. episode reward total was -2.0. running mean: 8.637511053758926
resetting env. episode reward total was 12.0. running mean: 8.671135943

resetting env. episode reward total was 12.0. running mean: 9.1121568802631
resetting env. episode reward total was 9.0. running mean: 9.11103531146047
resetting env. episode reward total was 6.0. running mean: 9.079924958345865
resetting env. episode reward total was 6.0. running mean: 9.049125708762407
resetting env. episode reward total was 11.0. running mean: 9.068634451674782
resetting env. episode reward total was 4.0. running mean: 9.017948107158034
resetting env. episode reward total was 12.0. running mean: 9.047768626086453
resetting env. episode reward total was 11.0. running mean: 9.067290939825588
resetting env. episode reward total was 3.0. running mean: 9.006618030427331
resetting env. episode reward total was 10.0. running mean: 9.016551850123058
resetting env. episode reward total was 3.0. running mean: 8.956386331621827
resetting env. episode reward total was 4.0. running mean: 8.906822468305608
resetting env. episode reward total was 6.0. running mean: 8.8777542436225

resetting env. episode reward total was 14.0. running mean: 9.073511708787418
resetting env. episode reward total was -3.0. running mean: 8.952776591699545
resetting env. episode reward total was 16.0. running mean: 9.023248825782549
resetting env. episode reward total was 11.0. running mean: 9.043016337524723
resetting env. episode reward total was 9.0. running mean: 9.042586174149475
resetting env. episode reward total was 12.0. running mean: 9.072160312407979
resetting env. episode reward total was 10.0. running mean: 9.081438709283898
resetting env. episode reward total was 8.0. running mean: 9.070624322191058
resetting env. episode reward total was 11.0. running mean: 9.089918078969147
resetting env. episode reward total was 11.0. running mean: 9.109018898179455
resetting env. episode reward total was -9.0. running mean: 8.92792870919766
resetting env. episode reward total was 6.0. running mean: 8.898649422105684
resetting env. episode reward total was 14.0. running mean: 8.949662

resetting env. episode reward total was 14.0. running mean: 9.003515421931633
resetting env. episode reward total was 13.0. running mean: 9.043480267712317
resetting env. episode reward total was 4.0. running mean: 8.993045465035193
resetting env. episode reward total was 10.0. running mean: 9.00311501038484
resetting env. episode reward total was 11.0. running mean: 9.023083860280991
resetting env. episode reward total was 5.0. running mean: 8.982853021678181
resetting env. episode reward total was 11.0. running mean: 9.0030244914614
resetting env. episode reward total was 10.0. running mean: 9.012994246546786
resetting env. episode reward total was 1.0. running mean: 8.932864304081317
resetting env. episode reward total was 13.0. running mean: 8.973535661040504
resetting env. episode reward total was 8.0. running mean: 8.9638003044301
resetting env. episode reward total was 5.0. running mean: 8.924162301385799
resetting env. episode reward total was 11.0. running mean: 8.944920678371

resetting env. episode reward total was 16.0. running mean: 9.238772978015973
resetting env. episode reward total was 3.0. running mean: 9.176385248235812
resetting env. episode reward total was 15.0. running mean: 9.234621395753454
resetting env. episode reward total was 5.0. running mean: 9.19227518179592
resetting env. episode reward total was 4.0. running mean: 9.140352429977959
resetting env. episode reward total was 7.0. running mean: 9.11894890567818
resetting env. episode reward total was 9.0. running mean: 9.117759416621396
resetting env. episode reward total was 14.0. running mean: 9.166581822455184
resetting env. episode reward total was 8.0. running mean: 9.154916004230632
resetting env. episode reward total was 6.0. running mean: 9.123366844188325
resetting env. episode reward total was 7.0. running mean: 9.102133175746442
resetting env. episode reward total was 16.0. running mean: 9.171111843988978
resetting env. episode reward total was 10.0. running mean: 9.179400725549

resetting env. episode reward total was 18.0. running mean: 8.948937213374304
resetting env. episode reward total was 8.0. running mean: 8.939447841240561
resetting env. episode reward total was 13.0. running mean: 8.980053362828157
resetting env. episode reward total was 13.0. running mean: 9.020252829199876
resetting env. episode reward total was 6.0. running mean: 8.990050300907878
resetting env. episode reward total was 4.0. running mean: 8.9401497978988
resetting env. episode reward total was 14.0. running mean: 8.990748299919812
resetting env. episode reward total was 10.0. running mean: 9.000840816920613
resetting env. episode reward total was 13.0. running mean: 9.040832408751408
resetting env. episode reward total was 11.0. running mean: 9.060424084663893
resetting env. episode reward total was 6.0. running mean: 9.029819843817254
resetting env. episode reward total was 15.0. running mean: 9.089521645379081
resetting env. episode reward total was 14.0. running mean: 9.13862642

resetting env. episode reward total was 9.0. running mean: 9.014080574984366
resetting env. episode reward total was 12.0. running mean: 9.043939769234521
resetting env. episode reward total was 11.0. running mean: 9.063500371542176
resetting env. episode reward total was 10.0. running mean: 9.072865367826754
resetting env. episode reward total was 1.0. running mean: 8.992136714148486
resetting env. episode reward total was 13.0. running mean: 9.032215347007002
resetting env. episode reward total was 10.0. running mean: 9.04189319353693
resetting env. episode reward total was 6.0. running mean: 9.011474261601562
resetting env. episode reward total was -5.0. running mean: 8.871359518985544
resetting env. episode reward total was 10.0. running mean: 8.882645923795689
resetting env. episode reward total was 11.0. running mean: 8.903819464557731
resetting env. episode reward total was 17.0. running mean: 8.984781269912153
resetting env. episode reward total was 12.0. running mean: 9.014933

resetting env. episode reward total was 2.0. running mean: 8.840777519464268
resetting env. episode reward total was 7.0. running mean: 8.822369744269626
resetting env. episode reward total was 10.0. running mean: 8.834146046826929
resetting env. episode reward total was 7.0. running mean: 8.81580458635866
resetting env. episode reward total was 12.0. running mean: 8.847646540495072
resetting env. episode reward total was 15.0. running mean: 8.909170075090122
resetting env. episode reward total was 7.0. running mean: 8.890078374339222
resetting env. episode reward total was 5.0. running mean: 8.85117759059583
resetting env. episode reward total was 16.0. running mean: 8.922665814689871
resetting env. episode reward total was 11.0. running mean: 8.943439156542972
resetting env. episode reward total was 9.0. running mean: 8.944004764977542
resetting env. episode reward total was 8.0. running mean: 8.934564717327767
resetting env. episode reward total was 4.0. running mean: 8.885219070154

resetting env. episode reward total was 10.0. running mean: 9.057070972223057
resetting env. episode reward total was 15.0. running mean: 9.116500262500827
resetting env. episode reward total was 15.0. running mean: 9.175335259875819
resetting env. episode reward total was 9.0. running mean: 9.173581907277061
resetting env. episode reward total was 17.0. running mean: 9.25184608820429
resetting env. episode reward total was 5.0. running mean: 9.209327627322248
resetting env. episode reward total was 9.0. running mean: 9.207234351049024
resetting env. episode reward total was 7.0. running mean: 9.185162007538533
resetting env. episode reward total was 11.0. running mean: 9.203310387463148
resetting env. episode reward total was 14.0. running mean: 9.251277283588516
resetting env. episode reward total was 9.0. running mean: 9.24876451075263
resetting env. episode reward total was 12.0. running mean: 9.276276865645103
resetting env. episode reward total was 10.0. running mean: 9.283514096

resetting env. episode reward total was 7.0. running mean: 9.287718300908276
resetting env. episode reward total was 12.0. running mean: 9.314841117899192
resetting env. episode reward total was 14.0. running mean: 9.3616927067202
resetting env. episode reward total was 8.0. running mean: 9.348075779652998
resetting env. episode reward total was 9.0. running mean: 9.344595021856467
resetting env. episode reward total was 8.0. running mean: 9.331149071637903
resetting env. episode reward total was 14.0. running mean: 9.377837580921524
resetting env. episode reward total was 11.0. running mean: 9.394059205112308
resetting env. episode reward total was 9.0. running mean: 9.390118613061185
resetting env. episode reward total was 10.0. running mean: 9.396217426930573
resetting env. episode reward total was 16.0. running mean: 9.462255252661267
resetting env. episode reward total was 4.0. running mean: 9.407632700134654
resetting env. episode reward total was 11.0. running mean: 9.4235563731

resetting env. episode reward total was 13.0. running mean: 9.24582364890233
resetting env. episode reward total was 14.0. running mean: 9.293365412413308
resetting env. episode reward total was -2.0. running mean: 9.180431758289176
resetting env. episode reward total was 6.0. running mean: 9.148627440706285
resetting env. episode reward total was -5.0. running mean: 9.007141166299222
resetting env. episode reward total was 6.0. running mean: 8.977069754636231
resetting env. episode reward total was 10.0. running mean: 8.987299057089869
resetting env. episode reward total was 4.0. running mean: 8.93742606651897
resetting env. episode reward total was 11.0. running mean: 8.95805180585378
resetting env. episode reward total was 2.0. running mean: 8.888471287795241
resetting env. episode reward total was 7.0. running mean: 8.869586574917289
resetting env. episode reward total was 14.0. running mean: 8.920890709168116
resetting env. episode reward total was 4.0. running mean: 8.87168180207

resetting env. episode reward total was 12.0. running mean: 8.879071232767512
resetting env. episode reward total was 17.0. running mean: 8.960280520439836
resetting env. episode reward total was 14.0. running mean: 9.010677715235438
resetting env. episode reward total was 13.0. running mean: 9.050570938083085
resetting env. episode reward total was 9.0. running mean: 9.050065228702254
resetting env. episode reward total was 15.0. running mean: 9.10956457641523
resetting env. episode reward total was 3.0. running mean: 9.048468930651078
resetting env. episode reward total was 16.0. running mean: 9.117984241344567
resetting env. episode reward total was -6.0. running mean: 8.96680439893112
resetting env. episode reward total was 12.0. running mean: 8.997136354941809
resetting env. episode reward total was 9.0. running mean: 8.99716499139239
resetting env. episode reward total was 8.0. running mean: 8.987193341478466
resetting env. episode reward total was 6.0. running mean: 8.9573214080

resetting env. episode reward total was 15.0. running mean: 9.297946978131264
resetting env. episode reward total was 15.0. running mean: 9.354967508349953
resetting env. episode reward total was 9.0. running mean: 9.351417833266453
resetting env. episode reward total was 8.0. running mean: 9.337903654933788
resetting env. episode reward total was 7.0. running mean: 9.31452461838445
resetting env. episode reward total was 12.0. running mean: 9.341379372200604
resetting env. episode reward total was 10.0. running mean: 9.347965578478599
resetting env. episode reward total was 16.0. running mean: 9.414485922693812
resetting env. episode reward total was 7.0. running mean: 9.390341063466874
resetting env. episode reward total was 4.0. running mean: 9.336437652832204
resetting env. episode reward total was 10.0. running mean: 9.343073276303882
resetting env. episode reward total was 6.0. running mean: 9.309642543540843
resetting env. episode reward total was 6.0. running mean: 9.2765461181

resetting env. episode reward total was 13.0. running mean: 9.39965410408817
resetting env. episode reward total was 13.0. running mean: 9.435657563047288
resetting env. episode reward total was 13.0. running mean: 9.471300987416816
resetting env. episode reward total was 17.0. running mean: 9.546587977542648
resetting env. episode reward total was 11.0. running mean: 9.561122097767221
resetting env. episode reward total was 11.0. running mean: 9.575510876789549
resetting env. episode reward total was 13.0. running mean: 9.609755768021655
resetting env. episode reward total was 8.0. running mean: 9.593658210341438
resetting env. episode reward total was 10.0. running mean: 9.597721628238023
resetting env. episode reward total was 13.0. running mean: 9.631744411955644
resetting env. episode reward total was 12.0. running mean: 9.655426967836087
resetting env. episode reward total was 13.0. running mean: 9.688872698157727
resetting env. episode reward total was 12.0. running mean: 9.7119

resetting env. episode reward total was 13.0. running mean: 9.661365875645549
resetting env. episode reward total was 8.0. running mean: 9.644752216889094
resetting env. episode reward total was 15.0. running mean: 9.698304694720203
resetting env. episode reward total was 14.0. running mean: 9.741321647773002
resetting env. episode reward total was 19.0. running mean: 9.833908431295272
resetting env. episode reward total was 13.0. running mean: 9.865569346982321
resetting env. episode reward total was 16.0. running mean: 9.926913653512498
resetting env. episode reward total was 9.0. running mean: 9.917644516977372
resetting env. episode reward total was 4.0. running mean: 9.858468071807598
resetting env. episode reward total was 13.0. running mean: 9.889883391089523
resetting env. episode reward total was 10.0. running mean: 9.890984557178626
resetting env. episode reward total was 14.0. running mean: 9.932074711606841
resetting env. episode reward total was 15.0. running mean: 9.98275

resetting env. episode reward total was 10.0. running mean: 9.953574067977684
resetting env. episode reward total was 15.0. running mean: 10.004038327297907
resetting env. episode reward total was 7.0. running mean: 9.973997944024928
resetting env. episode reward total was 13.0. running mean: 10.00425796458468
resetting env. episode reward total was 7.0. running mean: 9.974215384938834
resetting env. episode reward total was 11.0. running mean: 9.984473231089444
resetting env. episode reward total was 19.0. running mean: 10.07462849877855
resetting env. episode reward total was 14.0. running mean: 10.113882213790765
resetting env. episode reward total was 14.0. running mean: 10.152743391652857
resetting env. episode reward total was 5.0. running mean: 10.10121595773633
resetting env. episode reward total was 15.0. running mean: 10.150203798158966
resetting env. episode reward total was 6.0. running mean: 10.108701760177377
resetting env. episode reward total was 17.0. running mean: 10.

resetting env. episode reward total was 11.0. running mean: 9.779898599837875
resetting env. episode reward total was 10.0. running mean: 9.782099613839495
resetting env. episode reward total was 14.0. running mean: 9.8242786177011
resetting env. episode reward total was 12.0. running mean: 9.84603583152409
resetting env. episode reward total was 2.0. running mean: 9.767575473208847
resetting env. episode reward total was 9.0. running mean: 9.759899718476758
resetting env. episode reward total was 12.0. running mean: 9.782300721291989
resetting env. episode reward total was 7.0. running mean: 9.75447771407907
resetting env. episode reward total was 11.0. running mean: 9.766932936938279
resetting env. episode reward total was 13.0. running mean: 9.799263607568896
resetting env. episode reward total was 11.0. running mean: 9.811270971493206
resetting env. episode reward total was 7.0. running mean: 9.783158261778274
resetting env. episode reward total was 17.0. running mean: 9.8553266791

resetting env. episode reward total was 13.0. running mean: 10.135952945123915
resetting env. episode reward total was 8.0. running mean: 10.114593415672676
resetting env. episode reward total was 8.0. running mean: 10.09344748151595
resetting env. episode reward total was 11.0. running mean: 10.10251300670079
resetting env. episode reward total was 10.0. running mean: 10.101487876633781
resetting env. episode reward total was 15.0. running mean: 10.150472997867444
resetting env. episode reward total was 17.0. running mean: 10.21896826788877
resetting env. episode reward total was 7.0. running mean: 10.18677858520988
resetting env. episode reward total was 12.0. running mean: 10.204910799357782
resetting env. episode reward total was 11.0. running mean: 10.212861691364203
resetting env. episode reward total was 14.0. running mean: 10.250733074450562
resetting env. episode reward total was 13.0. running mean: 10.278225743706056
resetting env. episode reward total was 1.0. running mean: 

resetting env. episode reward total was 7.0. running mean: 10.039824214297745
resetting env. episode reward total was 10.0. running mean: 10.039425972154767
resetting env. episode reward total was 11.0. running mean: 10.04903171243322
resetting env. episode reward total was 9.0. running mean: 10.038541395308886
resetting env. episode reward total was 15.0. running mean: 10.088155981355797
resetting env. episode reward total was 5.0. running mean: 10.03727442154224
resetting env. episode reward total was 1.0. running mean: 9.946901677326817
resetting env. episode reward total was 6.0. running mean: 9.90743266055355
resetting env. episode reward total was 14.0. running mean: 9.948358333948015
resetting env. episode reward total was 13.0. running mean: 9.978874750608535
resetting env. episode reward total was 8.0. running mean: 9.959086003102449
resetting env. episode reward total was 13.0. running mean: 9.989495143071425
resetting env. episode reward total was 16.0. running mean: 10.0496

resetting env. episode reward total was 13.0. running mean: 10.051928953596539
resetting env. episode reward total was 9.0. running mean: 10.041409664060573
resetting env. episode reward total was 12.0. running mean: 10.060995567419967
resetting env. episode reward total was 10.0. running mean: 10.060385611745767
resetting env. episode reward total was 3.0. running mean: 9.989781755628309
resetting env. episode reward total was 16.0. running mean: 10.049883938072027
resetting env. episode reward total was 15.0. running mean: 10.099385098691307
resetting env. episode reward total was 5.0. running mean: 10.048391247704394
resetting env. episode reward total was 12.0. running mean: 10.06790733522735
resetting env. episode reward total was 4.0. running mean: 10.007228261875076
resetting env. episode reward total was 10.0. running mean: 10.007155979256325
resetting env. episode reward total was 12.0. running mean: 10.02708441946376
resetting env. episode reward total was 8.0. running mean: 

resetting env. episode reward total was 10.0. running mean: 9.867062625222502
resetting env. episode reward total was 4.0. running mean: 9.808391998970276
resetting env. episode reward total was 12.0. running mean: 9.830308078980572
resetting env. episode reward total was 13.0. running mean: 9.862004998190766
resetting env. episode reward total was 15.0. running mean: 9.91338494820886
resetting env. episode reward total was 8.0. running mean: 9.894251098726771
resetting env. episode reward total was 7.0. running mean: 9.865308587739504
resetting env. episode reward total was 14.0. running mean: 9.90665550186211
resetting env. episode reward total was 13.0. running mean: 9.937588946843489
resetting env. episode reward total was 4.0. running mean: 9.878213057375053
resetting env. episode reward total was 14.0. running mean: 9.919430926801303
resetting env. episode reward total was 5.0. running mean: 9.87023661753329
resetting env. episode reward total was 8.0. running mean: 9.85153425135

resetting env. episode reward total was 4.0. running mean: 9.308911074628321
resetting env. episode reward total was 5.0. running mean: 9.265821963882038
resetting env. episode reward total was 4.0. running mean: 9.213163744243216
resetting env. episode reward total was 16.0. running mean: 9.281032106800785
resetting env. episode reward total was 12.0. running mean: 9.308221785732776
resetting env. episode reward total was 1.0. running mean: 9.225139567875448
resetting env. episode reward total was 17.0. running mean: 9.302888172196694
resetting env. episode reward total was 10.0. running mean: 9.309859290474726
resetting env. episode reward total was 11.0. running mean: 9.326760697569979
resetting env. episode reward total was 8.0. running mean: 9.31349309059428
resetting env. episode reward total was 10.0. running mean: 9.320358159688336
resetting env. episode reward total was 3.0. running mean: 9.257154578091452
resetting env. episode reward total was 11.0. running mean: 9.274583032

resetting env. episode reward total was 16.0. running mean: 9.667427111845369
resetting env. episode reward total was 9.0. running mean: 9.660752840726914
resetting env. episode reward total was 13.0. running mean: 9.694145312319646
resetting env. episode reward total was 6.0. running mean: 9.65720385919645
resetting env. episode reward total was 8.0. running mean: 9.640631820604485
resetting env. episode reward total was 10.0. running mean: 9.64422550239844
resetting env. episode reward total was 19.0. running mean: 9.737783247374455
resetting env. episode reward total was 10.0. running mean: 9.74040541490071
resetting env. episode reward total was 15.0. running mean: 9.793001360751703
resetting env. episode reward total was 8.0. running mean: 9.775071347144186
resetting env. episode reward total was 9.0. running mean: 9.767320633672744
resetting env. episode reward total was 10.0. running mean: 9.769647427336016
resetting env. episode reward total was 13.0. running mean: 9.8019509530

resetting env. episode reward total was 14.0. running mean: 9.383048170851383
resetting env. episode reward total was -9.0. running mean: 9.19921768914287
resetting env. episode reward total was 8.0. running mean: 9.187225512251441
resetting env. episode reward total was 2.0. running mean: 9.115353257128927
resetting env. episode reward total was 12.0. running mean: 9.144199724557637
resetting env. episode reward total was 8.0. running mean: 9.132757727312061
resetting env. episode reward total was 14.0. running mean: 9.18143015003894
resetting env. episode reward total was 15.0. running mean: 9.23961584853855
resetting env. episode reward total was 13.0. running mean: 9.277219690053165
resetting env. episode reward total was 10.0. running mean: 9.284447493152634
resetting env. episode reward total was 16.0. running mean: 9.351603018221107
resetting env. episode reward total was 5.0. running mean: 9.308086988038896
resetting env. episode reward total was 10.0. running mean: 9.315006118

resetting env. episode reward total was 11.0. running mean: 9.8783277350044
resetting env. episode reward total was 6.0. running mean: 9.839544457654355
resetting env. episode reward total was 9.0. running mean: 9.83114901307781
resetting env. episode reward total was 8.0. running mean: 9.812837522947033
resetting env. episode reward total was 12.0. running mean: 9.834709147717563
resetting env. episode reward total was 13.0. running mean: 9.866362056240387
resetting env. episode reward total was -7.0. running mean: 9.697698435677983
resetting env. episode reward total was 18.0. running mean: 9.780721451321202
resetting env. episode reward total was 6.0. running mean: 9.74291423680799
resetting env. episode reward total was 10.0. running mean: 9.74548509443991
resetting env. episode reward total was 9.0. running mean: 9.738030243495512
resetting env. episode reward total was 12.0. running mean: 9.760649941060557
resetting env. episode reward total was 12.0. running mean: 9.783043441649

resetting env. episode reward total was 10.0. running mean: 9.515344346355178
resetting env. episode reward total was 13.0. running mean: 9.550190902891627
resetting env. episode reward total was 11.0. running mean: 9.56468899386271
resetting env. episode reward total was 14.0. running mean: 9.609042103924082
resetting env. episode reward total was 7.0. running mean: 9.582951682884842
resetting env. episode reward total was 7.0. running mean: 9.557122166055994
resetting env. episode reward total was 8.0. running mean: 9.541550944395434
resetting env. episode reward total was 10.0. running mean: 9.546135434951479
resetting env. episode reward total was 8.0. running mean: 9.530674080601964
resetting env. episode reward total was 15.0. running mean: 9.585367339795944
resetting env. episode reward total was 6.0. running mean: 9.549513666397985
resetting env. episode reward total was 14.0. running mean: 9.594018529734006
resetting env. episode reward total was 3.0. running mean: 9.528078344

resetting env. episode reward total was 10.0. running mean: 9.865915309231227
resetting env. episode reward total was 9.0. running mean: 9.857256156138915
resetting env. episode reward total was 12.0. running mean: 9.878683594577526
resetting env. episode reward total was 7.0. running mean: 9.84989675863175
resetting env. episode reward total was 17.0. running mean: 9.921397791045433
resetting env. episode reward total was 11.0. running mean: 9.932183813134978
resetting env. episode reward total was 12.0. running mean: 9.952861975003628
resetting env. episode reward total was 10.0. running mean: 9.953333355253593
resetting env. episode reward total was 11.0. running mean: 9.963800021701056
resetting env. episode reward total was 15.0. running mean: 10.014162021484045
resetting env. episode reward total was 3.0. running mean: 9.944020401269205
resetting env. episode reward total was 8.0. running mean: 9.924580197256512
resetting env. episode reward total was 8.0. running mean: 9.9053343

resetting env. episode reward total was 10.0. running mean: 9.314407378214076
resetting env. episode reward total was 17.0. running mean: 9.391263304431936
resetting env. episode reward total was 16.0. running mean: 9.457350671387617
resetting env. episode reward total was 4.0. running mean: 9.40277716467374
resetting env. episode reward total was 4.0. running mean: 9.348749393027001
resetting env. episode reward total was 5.0. running mean: 9.305261899096731
resetting env. episode reward total was 16.0. running mean: 9.372209280105764
resetting env. episode reward total was 6.0. running mean: 9.338487187304708
resetting env. episode reward total was 18.0. running mean: 9.42510231543166
resetting env. episode reward total was 10.0. running mean: 9.430851292277342
resetting env. episode reward total was 7.0. running mean: 9.406542779354568
resetting env. episode reward total was 4.0. running mean: 9.352477351561022
resetting env. episode reward total was 10.0. running mean: 9.3589525780

resetting env. episode reward total was 15.0. running mean: 9.03000680656197
resetting env. episode reward total was 7.0. running mean: 9.00970673849635
resetting env. episode reward total was 10.0. running mean: 9.019609671111386
resetting env. episode reward total was 9.0. running mean: 9.019413574400271
resetting env. episode reward total was 12.0. running mean: 9.049219438656268
resetting env. episode reward total was 12.0. running mean: 9.078727244269704
resetting env. episode reward total was 8.0. running mean: 9.067939971827007
resetting env. episode reward total was 15.0. running mean: 9.127260572108737
resetting env. episode reward total was -1.0. running mean: 9.02598796638765
resetting env. episode reward total was 7.0. running mean: 9.005728086723773
resetting env. episode reward total was 4.0. running mean: 8.955670805856533
resetting env. episode reward total was 9.0. running mean: 8.956114097797968
resetting env. episode reward total was 10.0. running mean: 8.96655295681

resetting env. episode reward total was 9.0. running mean: 9.694178692959467
resetting env. episode reward total was 15.0. running mean: 9.747236906029872
resetting env. episode reward total was 16.0. running mean: 9.809764536969574
resetting env. episode reward total was 5.0. running mean: 9.761666891599878
resetting env. episode reward total was 11.0. running mean: 9.77405022268388
resetting env. episode reward total was 15.0. running mean: 9.82630972045704
resetting env. episode reward total was 15.0. running mean: 9.87804662325247
resetting env. episode reward total was 8.0. running mean: 9.859266157019945
resetting env. episode reward total was 5.0. running mean: 9.810673495449747
resetting env. episode reward total was 14.0. running mean: 9.85256676049525
resetting env. episode reward total was 5.0. running mean: 9.804041092890298
resetting env. episode reward total was 10.0. running mean: 9.806000681961395
resetting env. episode reward total was 10.0. running mean: 9.80794067514

resetting env. episode reward total was 12.0. running mean: 10.218400220921765
resetting env. episode reward total was 3.0. running mean: 10.146216218712548
resetting env. episode reward total was 10.0. running mean: 10.144754056525422
resetting env. episode reward total was 10.0. running mean: 10.143306515960166
resetting env. episode reward total was 11.0. running mean: 10.151873450800563
resetting env. episode reward total was 13.0. running mean: 10.180354716292559
resetting env. episode reward total was 10.0. running mean: 10.178551169129634
resetting env. episode reward total was 14.0. running mean: 10.216765657438337
resetting env. episode reward total was 15.0. running mean: 10.264598000863954
resetting env. episode reward total was 21.0. running mean: 10.371952020855314
resetting env. episode reward total was -1.0. running mean: 10.258232500646761
resetting env. episode reward total was 14.0. running mean: 10.295650175640294
resetting env. episode reward total was 8.0. running 

resetting env. episode reward total was 9.0. running mean: 10.505053679321351
resetting env. episode reward total was 6.0. running mean: 10.460003142528139
resetting env. episode reward total was 12.0. running mean: 10.475403111102857
resetting env. episode reward total was 9.0. running mean: 10.460649079991828
resetting env. episode reward total was 10.0. running mean: 10.45604258919191
resetting env. episode reward total was 11.0. running mean: 10.46148216329999
resetting env. episode reward total was 8.0. running mean: 10.436867341666991
resetting env. episode reward total was 6.0. running mean: 10.39249866825032
resetting env. episode reward total was 5.0. running mean: 10.338573681567818
resetting env. episode reward total was 10.0. running mean: 10.335187944752139
resetting env. episode reward total was 2.0. running mean: 10.251836065304618
resetting env. episode reward total was 11.0. running mean: 10.25931770465157
resetting env. episode reward total was 12.0. running mean: 10.

resetting env. episode reward total was 12.0. running mean: 10.193110207651431
resetting env. episode reward total was 11.0. running mean: 10.201179105574916
resetting env. episode reward total was 7.0. running mean: 10.169167314519166
resetting env. episode reward total was 13.0. running mean: 10.197475641373975
resetting env. episode reward total was 8.0. running mean: 10.175500884960236
resetting env. episode reward total was 5.0. running mean: 10.123745876110634
resetting env. episode reward total was 10.0. running mean: 10.122508417349527
resetting env. episode reward total was 7.0. running mean: 10.091283333176031
resetting env. episode reward total was 4.0. running mean: 10.03037049984427
resetting env. episode reward total was 4.0. running mean: 9.970066794845827
resetting env. episode reward total was -8.0. running mean: 9.79036612689737
resetting env. episode reward total was 13.0. running mean: 9.822462465628396
resetting env. episode reward total was 15.0. running mean: 9.8

resetting env. episode reward total was 12.0. running mean: 10.080660359942335
resetting env. episode reward total was 14.0. running mean: 10.119853756342913
resetting env. episode reward total was 12.0. running mean: 10.138655218779483
resetting env. episode reward total was 8.0. running mean: 10.117268666591688
resetting env. episode reward total was 12.0. running mean: 10.13609597992577
resetting env. episode reward total was 15.0. running mean: 10.184735020126512
resetting env. episode reward total was 17.0. running mean: 10.252887669925247
resetting env. episode reward total was 14.0. running mean: 10.290358793225995
resetting env. episode reward total was 14.0. running mean: 10.327455205293735
resetting env. episode reward total was 16.0. running mean: 10.384180653240797
resetting env. episode reward total was 10.0. running mean: 10.380338846708389
resetting env. episode reward total was 5.0. running mean: 10.326535458241306
resetting env. episode reward total was 9.0. running me

resetting env. episode reward total was 15.0. running mean: 10.555143681414838
resetting env. episode reward total was 10.0. running mean: 10.54959224460069
resetting env. episode reward total was 12.0. running mean: 10.564096322154683
resetting env. episode reward total was 13.0. running mean: 10.588455358933137
resetting env. episode reward total was -1.0. running mean: 10.472570805343805
resetting env. episode reward total was 3.0. running mean: 10.397845097290366
resetting env. episode reward total was 4.0. running mean: 10.333866646317462
resetting env. episode reward total was 15.0. running mean: 10.380527979854287
resetting env. episode reward total was 2.0. running mean: 10.296722700055744
resetting env. episode reward total was 13.0. running mean: 10.323755473055186
resetting env. episode reward total was 3.0. running mean: 10.250517918324634
resetting env. episode reward total was 8.0. running mean: 10.228012739141388
resetting env. episode reward total was 16.0. running mean

resetting env. episode reward total was 11.0. running mean: 9.997368762865424
resetting env. episode reward total was 11.0. running mean: 10.007395075236769
resetting env. episode reward total was 12.0. running mean: 10.0273211244844
resetting env. episode reward total was 5.0. running mean: 9.977047913239558
resetting env. episode reward total was 9.0. running mean: 9.967277434107162
resetting env. episode reward total was 7.0. running mean: 9.937604659766091
resetting env. episode reward total was 8.0. running mean: 9.91822861316843
resetting env. episode reward total was 7.0. running mean: 9.889046327036747
resetting env. episode reward total was 13.0. running mean: 9.92015586376638
resetting env. episode reward total was 9.0. running mean: 9.910954305128717
resetting env. episode reward total was 15.0. running mean: 9.96184476207743
resetting env. episode reward total was 10.0. running mean: 9.962226314456656
resetting env. episode reward total was 12.0. running mean: 9.98260405131

resetting env. episode reward total was 12.0. running mean: 9.643059583109869
resetting env. episode reward total was 12.0. running mean: 9.66662898727877
resetting env. episode reward total was 11.0. running mean: 9.679962697405982
resetting env. episode reward total was 9.0. running mean: 9.673163070431922
resetting env. episode reward total was 6.0. running mean: 9.636431439727604
resetting env. episode reward total was 3.0. running mean: 9.570067125330327
resetting env. episode reward total was 14.0. running mean: 9.614366454077025
resetting env. episode reward total was 10.0. running mean: 9.618222789536254
resetting env. episode reward total was 12.0. running mean: 9.642040561640892
resetting env. episode reward total was 11.0. running mean: 9.655620156024483
resetting env. episode reward total was 9.0. running mean: 9.649063954464237
resetting env. episode reward total was 13.0. running mean: 9.682573314919596
resetting env. episode reward total was 13.0. running mean: 9.7157475

resetting env. episode reward total was 14.0. running mean: 9.791141878932638
resetting env. episode reward total was 9.0. running mean: 9.783230460143312
resetting env. episode reward total was 8.0. running mean: 9.76539815554188
resetting env. episode reward total was 9.0. running mean: 9.75774417398646
resetting env. episode reward total was 12.0. running mean: 9.780166732246593
resetting env. episode reward total was 11.0. running mean: 9.792365064924127
resetting env. episode reward total was 14.0. running mean: 9.834441414274886
resetting env. episode reward total was 14.0. running mean: 9.876097000132138
resetting env. episode reward total was 10.0. running mean: 9.877336030130815
resetting env. episode reward total was 7.0. running mean: 9.848562669829507
resetting env. episode reward total was 13.0. running mean: 9.880077043131212
resetting env. episode reward total was 13.0. running mean: 9.9112762726999
resetting env. episode reward total was 13.0. running mean: 9.9421635099

resetting env. episode reward total was 14.0. running mean: 10.1984071565473
resetting env. episode reward total was 13.0. running mean: 10.226423084981827
resetting env. episode reward total was 7.0. running mean: 10.194158854132008
resetting env. episode reward total was 7.0. running mean: 10.162217265590687
resetting env. episode reward total was 10.0. running mean: 10.16059509293478
resetting env. episode reward total was -4.0. running mean: 10.018989142005433
resetting env. episode reward total was 9.0. running mean: 10.008799250585378
resetting env. episode reward total was 13.0. running mean: 10.038711258079525
resetting env. episode reward total was 15.0. running mean: 10.08832414549873
resetting env. episode reward total was 5.0. running mean: 10.037440904043743
resetting env. episode reward total was 5.0. running mean: 9.987066495003306
resetting env. episode reward total was 10.0. running mean: 9.987195830053272
resetting env. episode reward total was 10.0. running mean: 9.9

resetting env. episode reward total was 17.0. running mean: 10.039114792343502
resetting env. episode reward total was 14.0. running mean: 10.078723644420068
resetting env. episode reward total was 20.0. running mean: 10.177936407975865
resetting env. episode reward total was 10.0. running mean: 10.176157043896106
resetting env. episode reward total was 15.0. running mean: 10.224395473457145
resetting env. episode reward total was 10.0. running mean: 10.222151518722573
resetting env. episode reward total was 18.0. running mean: 10.299930003535346
resetting env. episode reward total was 13.0. running mean: 10.326930703499993
resetting env. episode reward total was 3.0. running mean: 10.253661396464993
resetting env. episode reward total was 15.0. running mean: 10.301124782500343
resetting env. episode reward total was 6.0. running mean: 10.25811353467534
resetting env. episode reward total was 4.0. running mean: 10.195532399328586
resetting env. episode reward total was 11.0. running me

resetting env. episode reward total was 9.0. running mean: 9.886324682416198
resetting env. episode reward total was 15.0. running mean: 9.937461435592036
resetting env. episode reward total was 14.0. running mean: 9.978086821236117
resetting env. episode reward total was 13.0. running mean: 10.008305953023756
resetting env. episode reward total was 15.0. running mean: 10.058222893493518
resetting env. episode reward total was 9.0. running mean: 10.047640664558582
resetting env. episode reward total was 10.0. running mean: 10.047164257912996
resetting env. episode reward total was 14.0. running mean: 10.086692615333867
resetting env. episode reward total was 12.0. running mean: 10.105825689180527
resetting env. episode reward total was 12.0. running mean: 10.12476743228872
resetting env. episode reward total was 15.0. running mean: 10.173519757965833
resetting env. episode reward total was 9.0. running mean: 10.161784560386174
resetting env. episode reward total was 5.0. running mean: 

resetting env. episode reward total was 14.0. running mean: 10.539959480244406
resetting env. episode reward total was 11.0. running mean: 10.544559885441961
resetting env. episode reward total was 9.0. running mean: 10.529114286587541
resetting env. episode reward total was 15.0. running mean: 10.573823143721667
resetting env. episode reward total was 7.0. running mean: 10.53808491228445
resetting env. episode reward total was 14.0. running mean: 10.572704063161606
resetting env. episode reward total was 13.0. running mean: 10.596977022529991
resetting env. episode reward total was 15.0. running mean: 10.641007252304691
resetting env. episode reward total was 9.0. running mean: 10.624597179781643
resetting env. episode reward total was 5.0. running mean: 10.568351207983827
resetting env. episode reward total was 8.0. running mean: 10.542667695903988
resetting env. episode reward total was 5.0. running mean: 10.48724101894495
resetting env. episode reward total was 11.0. running mean: 

resetting env. episode reward total was 11.0. running mean: 10.010760301976877
resetting env. episode reward total was 5.0. running mean: 9.960652698957109
resetting env. episode reward total was 11.0. running mean: 9.971046171967537
resetting env. episode reward total was 7.0. running mean: 9.941335710247861
resetting env. episode reward total was 13.0. running mean: 9.971922353145382
resetting env. episode reward total was 8.0. running mean: 9.95220312961393
resetting env. episode reward total was 12.0. running mean: 9.97268109831779
resetting env. episode reward total was 8.0. running mean: 9.952954287334611
resetting env. episode reward total was 7.0. running mean: 9.923424744461265
resetting env. episode reward total was 15.0. running mean: 9.974190497016652
resetting env. episode reward total was 4.0. running mean: 9.914448592046485
resetting env. episode reward total was -7.0. running mean: 9.745304106126019
resetting env. episode reward total was 8.0. running mean: 9.7278510650

resetting env. episode reward total was 13.0. running mean: 10.068413079831146
resetting env. episode reward total was 3.0. running mean: 9.997728949032833
resetting env. episode reward total was 7.0. running mean: 9.967751659542506
resetting env. episode reward total was 13.0. running mean: 9.99807414294708
resetting env. episode reward total was 11.0. running mean: 10.008093401517609
resetting env. episode reward total was 4.0. running mean: 9.948012467502432
resetting env. episode reward total was -3.0. running mean: 9.818532342827408
resetting env. episode reward total was 13.0. running mean: 9.850347019399136
resetting env. episode reward total was 10.0. running mean: 9.851843549205144
resetting env. episode reward total was 12.0. running mean: 9.873325113713092
resetting env. episode reward total was 14.0. running mean: 9.91459186257596
resetting env. episode reward total was 18.0. running mean: 9.9954459439502
resetting env. episode reward total was 8.0. running mean: 9.97549148

resetting env. episode reward total was 14.0. running mean: 9.918388148187542
resetting env. episode reward total was 13.0. running mean: 9.949204266705667
resetting env. episode reward total was 14.0. running mean: 9.98971222403861
resetting env. episode reward total was 9.0. running mean: 9.979815101798224
resetting env. episode reward total was 10.0. running mean: 9.980016950780241
resetting env. episode reward total was 15.0. running mean: 10.03021678127244
resetting env. episode reward total was 11.0. running mean: 10.039914613459715
resetting env. episode reward total was 15.0. running mean: 10.089515467325118
resetting env. episode reward total was 13.0. running mean: 10.118620312651867
resetting env. episode reward total was 20.0. running mean: 10.217434109525348
resetting env. episode reward total was -1.0. running mean: 10.105259768430095
resetting env. episode reward total was 13.0. running mean: 10.134207170745794
resetting env. episode reward total was 11.0. running mean: 

resetting env. episode reward total was 14.0. running mean: 10.231143320528476
resetting env. episode reward total was 12.0. running mean: 10.24883188732319
resetting env. episode reward total was 3.0. running mean: 10.176343568449957
resetting env. episode reward total was 10.0. running mean: 10.174580132765456
resetting env. episode reward total was 3.0. running mean: 10.102834331437801
resetting env. episode reward total was 6.0. running mean: 10.061805988123425
resetting env. episode reward total was 14.0. running mean: 10.10118792824219
resetting env. episode reward total was 10.0. running mean: 10.100176048959767
resetting env. episode reward total was 14.0. running mean: 10.13917428847017
resetting env. episode reward total was 8.0. running mean: 10.117782545585467
resetting env. episode reward total was 17.0. running mean: 10.186604720129612
resetting env. episode reward total was 18.0. running mean: 10.264738672928315
resetting env. episode reward total was 6.0. running mean: 

resetting env. episode reward total was 14.0. running mean: 10.346530696073945
resetting env. episode reward total was 14.0. running mean: 10.383065389113206
resetting env. episode reward total was 6.0. running mean: 10.339234735222075
resetting env. episode reward total was 13.0. running mean: 10.365842387869854
resetting env. episode reward total was 11.0. running mean: 10.372183963991155
resetting env. episode reward total was 7.0. running mean: 10.338462124351244
resetting env. episode reward total was 12.0. running mean: 10.355077503107731
resetting env. episode reward total was 15.0. running mean: 10.401526728076654
resetting env. episode reward total was 6.0. running mean: 10.357511460795887
resetting env. episode reward total was 13.0. running mean: 10.383936346187928
resetting env. episode reward total was 9.0. running mean: 10.370096982726048
resetting env. episode reward total was 6.0. running mean: 10.326396012898789
resetting env. episode reward total was 16.0. running mea

resetting env. episode reward total was 16.0. running mean: 10.160447488261045
resetting env. episode reward total was -2.0. running mean: 10.038843013378436
resetting env. episode reward total was 9.0. running mean: 10.02845458324465
resetting env. episode reward total was 8.0. running mean: 10.008170037412205
resetting env. episode reward total was 15.0. running mean: 10.058088337038082
resetting env. episode reward total was 13.0. running mean: 10.087507453667703
resetting env. episode reward total was 15.0. running mean: 10.136632379131026
resetting env. episode reward total was 18.0. running mean: 10.215266055339715
resetting env. episode reward total was 8.0. running mean: 10.193113394786318
resetting env. episode reward total was -3.0. running mean: 10.061182260838455
resetting env. episode reward total was 8.0. running mean: 10.040570438230072
resetting env. episode reward total was 12.0. running mean: 10.06016473384777
resetting env. episode reward total was 6.0. running mean:

resetting env. episode reward total was 15.0. running mean: 10.567530063581081
resetting env. episode reward total was 11.0. running mean: 10.57185476294527
resetting env. episode reward total was 12.0. running mean: 10.586136215315817
resetting env. episode reward total was 14.0. running mean: 10.620274853162659
resetting env. episode reward total was 9.0. running mean: 10.604072104631031
resetting env. episode reward total was 10.0. running mean: 10.59803138358472
resetting env. episode reward total was 9.0. running mean: 10.582051069748873
resetting env. episode reward total was 12.0. running mean: 10.596230559051383
resetting env. episode reward total was 4.0. running mean: 10.530268253460868
resetting env. episode reward total was 7.0. running mean: 10.49496557092626
resetting env. episode reward total was 16.0. running mean: 10.550015915216997
resetting env. episode reward total was 8.0. running mean: 10.524515756064828
resetting env. episode reward total was 1.0. running mean: 1

resetting env. episode reward total was 10.0. running mean: 10.49632033668614
resetting env. episode reward total was 12.0. running mean: 10.511357133319278
resetting env. episode reward total was 18.0. running mean: 10.586243561986084
resetting env. episode reward total was 8.0. running mean: 10.560381126366224
resetting env. episode reward total was 7.0. running mean: 10.524777315102561
resetting env. episode reward total was 8.0. running mean: 10.499529541951535
resetting env. episode reward total was 15.0. running mean: 10.54453424653202
resetting env. episode reward total was 7.0. running mean: 10.5090889040667
resetting env. episode reward total was 13.0. running mean: 10.533998015026034
resetting env. episode reward total was 14.0. running mean: 10.568658034875774
resetting env. episode reward total was 16.0. running mean: 10.622971454527017
resetting env. episode reward total was 7.0. running mean: 10.586741739981747
resetting env. episode reward total was 7.0. running mean: 10

resetting env. episode reward total was 8.0. running mean: 10.496175979478993
resetting env. episode reward total was 17.0. running mean: 10.561214219684203
resetting env. episode reward total was 10.0. running mean: 10.555602077487361
resetting env. episode reward total was 7.0. running mean: 10.520046056712488
resetting env. episode reward total was 7.0. running mean: 10.484845596145362
resetting env. episode reward total was 13.0. running mean: 10.50999714018391
resetting env. episode reward total was 12.0. running mean: 10.524897168782068
resetting env. episode reward total was 8.0. running mean: 10.499648197094247
resetting env. episode reward total was 10.0. running mean: 10.494651715123304
resetting env. episode reward total was 9.0. running mean: 10.47970519797207
resetting env. episode reward total was 9.0. running mean: 10.464908145992348
resetting env. episode reward total was 12.0. running mean: 10.480259064532424
resetting env. episode reward total was 7.0. running mean: 1

resetting env. episode reward total was 10.0. running mean: 10.332585756637691
resetting env. episode reward total was 9.0. running mean: 10.319259899071314
resetting env. episode reward total was 10.0. running mean: 10.3160673000806
resetting env. episode reward total was 9.0. running mean: 10.302906627079794
resetting env. episode reward total was 15.0. running mean: 10.349877560808997
resetting env. episode reward total was 16.0. running mean: 10.406378785200907
resetting env. episode reward total was 12.0. running mean: 10.422314997348897
resetting env. episode reward total was 14.0. running mean: 10.458091847375409
resetting env. episode reward total was 17.0. running mean: 10.523510928901654
resetting env. episode reward total was 7.0. running mean: 10.488275819612637
resetting env. episode reward total was 11.0. running mean: 10.49339306141651
resetting env. episode reward total was 1.0. running mean: 10.398459130802346
resetting env. episode reward total was 11.0. running mean:

resetting env. episode reward total was 12.0. running mean: 10.607222413080583
resetting env. episode reward total was 8.0. running mean: 10.581150188949778
resetting env. episode reward total was 9.0. running mean: 10.56533868706028
resetting env. episode reward total was 16.0. running mean: 10.619685300189676
resetting env. episode reward total was 9.0. running mean: 10.603488447187779
resetting env. episode reward total was 5.0. running mean: 10.547453562715901
resetting env. episode reward total was 18.0. running mean: 10.621979027088742
resetting env. episode reward total was 15.0. running mean: 10.665759236817854
resetting env. episode reward total was 7.0. running mean: 10.629101644449676
resetting env. episode reward total was 17.0. running mean: 10.69281062800518
resetting env. episode reward total was 2.0. running mean: 10.605882521725128
resetting env. episode reward total was 12.0. running mean: 10.619823696507876
resetting env. episode reward total was 9.0. running mean: 1

resetting env. episode reward total was 10.0. running mean: 10.624108564762583
resetting env. episode reward total was 12.0. running mean: 10.637867479114956
resetting env. episode reward total was 18.0. running mean: 10.711488804323805
resetting env. episode reward total was 11.0. running mean: 10.714373916280566
resetting env. episode reward total was 7.0. running mean: 10.67723017711776
resetting env. episode reward total was 13.0. running mean: 10.700457875346583
resetting env. episode reward total was 17.0. running mean: 10.763453296593116
resetting env. episode reward total was 12.0. running mean: 10.775818763627184
resetting env. episode reward total was 7.0. running mean: 10.738060575990913
resetting env. episode reward total was 15.0. running mean: 10.780679970231004
resetting env. episode reward total was 12.0. running mean: 10.792873170528694
resetting env. episode reward total was 9.0. running mean: 10.774944438823406
resetting env. episode reward total was 10.0. running me

resetting env. episode reward total was 12.0. running mean: 11.44479781406851
resetting env. episode reward total was 7.0. running mean: 11.400349835927825
resetting env. episode reward total was -2.0. running mean: 11.266346337568548
resetting env. episode reward total was 17.0. running mean: 11.323682874192862
resetting env. episode reward total was 7.0. running mean: 11.280446045450933
resetting env. episode reward total was 10.0. running mean: 11.267641584996424
resetting env. episode reward total was 13.0. running mean: 11.28496516914646
resetting env. episode reward total was 16.0. running mean: 11.332115517454996
resetting env. episode reward total was 14.0. running mean: 11.358794362280447
resetting env. episode reward total was 17.0. running mean: 11.415206418657643
resetting env. episode reward total was 20.0. running mean: 11.501054354471066
resetting env. episode reward total was 15.0. running mean: 11.536043810926355
resetting env. episode reward total was 9.0. running mea

resetting env. episode reward total was 15.0. running mean: 11.165107942778707
resetting env. episode reward total was 10.0. running mean: 11.15345686335092
resetting env. episode reward total was 10.0. running mean: 11.14192229471741
resetting env. episode reward total was 9.0. running mean: 11.120503071770237
resetting env. episode reward total was 11.0. running mean: 11.119298041052534
resetting env. episode reward total was 11.0. running mean: 11.118105060642009
resetting env. episode reward total was 10.0. running mean: 11.106924010035588
resetting env. episode reward total was 11.0. running mean: 11.105854769935231
resetting env. episode reward total was 10.0. running mean: 11.094796222235878
resetting env. episode reward total was 8.0. running mean: 11.06384826001352
resetting env. episode reward total was 14.0. running mean: 11.093209777413385
resetting env. episode reward total was 16.0. running mean: 11.142277679639252
resetting env. episode reward total was 7.0. running mean

resetting env. episode reward total was 9.0. running mean: 11.353842827213914
resetting env. episode reward total was 10.0. running mean: 11.340304398941774
resetting env. episode reward total was 9.0. running mean: 11.316901354952355
resetting env. episode reward total was 10.0. running mean: 11.303732341402831
resetting env. episode reward total was 8.0. running mean: 11.270695017988803
resetting env. episode reward total was 2.0. running mean: 11.177988067808915
resetting env. episode reward total was 7.0. running mean: 11.136208187130826
resetting env. episode reward total was 15.0. running mean: 11.174846105259519
resetting env. episode reward total was 14.0. running mean: 11.203097644206924
resetting env. episode reward total was 6.0. running mean: 11.151066667764855
resetting env. episode reward total was 2.0. running mean: 11.059556001087206
resetting env. episode reward total was 8.0. running mean: 11.028960441076334
resetting env. episode reward total was 11.0. running mean: 

resetting env. episode reward total was 17.0. running mean: 10.985676035362728
resetting env. episode reward total was 18.0. running mean: 11.0558192750091
resetting env. episode reward total was 10.0. running mean: 11.045261082259008
resetting env. episode reward total was 10.0. running mean: 11.034808471436419
resetting env. episode reward total was 12.0. running mean: 11.044460386722054
resetting env. episode reward total was 13.0. running mean: 11.064015782854835
resetting env. episode reward total was 19.0. running mean: 11.143375625026286
resetting env. episode reward total was 9.0. running mean: 11.121941868776023
resetting env. episode reward total was 13.0. running mean: 11.140722450088264
resetting env. episode reward total was 10.0. running mean: 11.12931522558738
resetting env. episode reward total was 11.0. running mean: 11.128022073331506
resetting env. episode reward total was 8.0. running mean: 11.096741852598191
resetting env. episode reward total was 9.0. running mean

resetting env. episode reward total was 10.0. running mean: 11.028830223633813
resetting env. episode reward total was 11.0. running mean: 11.028541921397474
resetting env. episode reward total was 12.0. running mean: 11.038256502183499
resetting env. episode reward total was 11.0. running mean: 11.037873937161663
resetting env. episode reward total was 19.0. running mean: 11.117495197790046
resetting env. episode reward total was 16.0. running mean: 11.166320245812145
resetting env. episode reward total was 13.0. running mean: 11.184657043354024
resetting env. episode reward total was 18.0. running mean: 11.252810472920483
resetting env. episode reward total was 13.0. running mean: 11.27028236819128
resetting env. episode reward total was 12.0. running mean: 11.277579544509367
resetting env. episode reward total was 4.0. running mean: 11.204803749064272
resetting env. episode reward total was 17.0. running mean: 11.262755711573629
resetting env. episode reward total was 16.0. running 

resetting env. episode reward total was 7.0. running mean: 11.2102726761577
resetting env. episode reward total was 11.0. running mean: 11.208169949396122
resetting env. episode reward total was 9.0. running mean: 11.186088249902161
resetting env. episode reward total was 9.0. running mean: 11.16422736740314
resetting env. episode reward total was 7.0. running mean: 11.122585093729109
resetting env. episode reward total was 10.0. running mean: 11.111359242791817
resetting env. episode reward total was 10.0. running mean: 11.100245650363899
resetting env. episode reward total was 7.0. running mean: 11.05924319386026
resetting env. episode reward total was 8.0. running mean: 11.028650761921657
resetting env. episode reward total was 7.0. running mean: 10.988364254302441
resetting env. episode reward total was 14.0. running mean: 11.018480611759417
resetting env. episode reward total was 6.0. running mean: 10.968295805641823
resetting env. episode reward total was 15.0. running mean: 11.0

resetting env. episode reward total was 17.0. running mean: 11.104743142165487
resetting env. episode reward total was 15.0. running mean: 11.143695710743833
resetting env. episode reward total was 18.0. running mean: 11.212258753636394
resetting env. episode reward total was 14.0. running mean: 11.240136166100031
resetting env. episode reward total was 19.0. running mean: 11.31773480443903
resetting env. episode reward total was 10.0. running mean: 11.30455745639464
resetting env. episode reward total was 9.0. running mean: 11.281511881830694
resetting env. episode reward total was 15.0. running mean: 11.318696763012387
resetting env. episode reward total was 14.0. running mean: 11.345509795382263
resetting env. episode reward total was 13.0. running mean: 11.36205469742844
resetting env. episode reward total was 17.0. running mean: 11.418434150454157
resetting env. episode reward total was 12.0. running mean: 11.424249808949615
resetting env. episode reward total was 12.0. running me

resetting env. episode reward total was 8.0. running mean: 11.307740117307947
resetting env. episode reward total was 15.0. running mean: 11.344662716134868
resetting env. episode reward total was 13.0. running mean: 11.36121608897352
resetting env. episode reward total was 9.0. running mean: 11.337603928083785
resetting env. episode reward total was 9.0. running mean: 11.314227888802947
resetting env. episode reward total was 15.0. running mean: 11.351085609914918
resetting env. episode reward total was 16.0. running mean: 11.39757475381577
resetting env. episode reward total was -11.0. running mean: 11.173599006277612
resetting env. episode reward total was 5.0. running mean: 11.111863016214837
resetting env. episode reward total was 16.0. running mean: 11.16074438605269
resetting env. episode reward total was 15.0. running mean: 11.199136942192164
resetting env. episode reward total was 16.0. running mean: 11.247145572770242
resetting env. episode reward total was 15.0. running mean

resetting env. episode reward total was 16.0. running mean: 11.27251141799994
resetting env. episode reward total was 13.0. running mean: 11.289786303819941
resetting env. episode reward total was 14.0. running mean: 11.316888440781742
resetting env. episode reward total was 15.0. running mean: 11.353719556373925
resetting env. episode reward total was 19.0. running mean: 11.430182360810186
resetting env. episode reward total was 6.0. running mean: 11.375880537202084
resetting env. episode reward total was 17.0. running mean: 11.432121731830064
resetting env. episode reward total was 8.0. running mean: 11.397800514511763
resetting env. episode reward total was 14.0. running mean: 11.423822509366646
resetting env. episode reward total was 12.0. running mean: 11.429584284272979
resetting env. episode reward total was 15.0. running mean: 11.46528844143025
resetting env. episode reward total was 15.0. running mean: 11.500635557015947
resetting env. episode reward total was 19.0. running me

resetting env. episode reward total was 14.0. running mean: 11.329758482301596
resetting env. episode reward total was 11.0. running mean: 11.32646089747858
resetting env. episode reward total was 7.0. running mean: 11.283196288503794
resetting env. episode reward total was 7.0. running mean: 11.240364325618756
resetting env. episode reward total was 17.0. running mean: 11.297960682362568
resetting env. episode reward total was 16.0. running mean: 11.344981075538943
resetting env. episode reward total was 6.0. running mean: 11.291531264783554
resetting env. episode reward total was 14.0. running mean: 11.318615952135719
resetting env. episode reward total was 11.0. running mean: 11.315429792614362
resetting env. episode reward total was 7.0. running mean: 11.272275494688218
resetting env. episode reward total was 5.0. running mean: 11.209552739741337
resetting env. episode reward total was 12.0. running mean: 11.217457212343923
resetting env. episode reward total was 16.0. running mean

resetting env. episode reward total was 1.0. running mean: 10.896291350085445
resetting env. episode reward total was 9.0. running mean: 10.87732843658459
resetting env. episode reward total was 15.0. running mean: 10.918555152218744
resetting env. episode reward total was 9.0. running mean: 10.899369600696556
resetting env. episode reward total was 8.0. running mean: 10.87037590468959
resetting env. episode reward total was 13.0. running mean: 10.891672145642694
resetting env. episode reward total was 14.0. running mean: 10.922755424186267
resetting env. episode reward total was 16.0. running mean: 10.973527869944405
resetting env. episode reward total was 13.0. running mean: 10.993792591244961
resetting env. episode reward total was -5.0. running mean: 10.833854665332511
resetting env. episode reward total was 12.0. running mean: 10.845516118679186
resetting env. episode reward total was 9.0. running mean: 10.827060957492394
resetting env. episode reward total was 9.0. running mean: 

resetting env. episode reward total was 12.0. running mean: 11.52507269001784
resetting env. episode reward total was 10.0. running mean: 11.50982196311766
resetting env. episode reward total was 17.0. running mean: 11.564723743486484
resetting env. episode reward total was 15.0. running mean: 11.599076506051619
resetting env. episode reward total was 16.0. running mean: 11.643085740991102
resetting env. episode reward total was 3.0. running mean: 11.55665488358119
resetting env. episode reward total was 8.0. running mean: 11.521088334745379
resetting env. episode reward total was 19.0. running mean: 11.595877451397925
resetting env. episode reward total was 12.0. running mean: 11.599918676883945
resetting env. episode reward total was 9.0. running mean: 11.573919490115106
resetting env. episode reward total was 11.0. running mean: 11.568180295213955
resetting env. episode reward total was 13.0. running mean: 11.582498492261816
resetting env. episode reward total was 10.0. running mean

resetting env. episode reward total was 16.0. running mean: 11.625146011705418
resetting env. episode reward total was 17.0. running mean: 11.678894551588364
resetting env. episode reward total was 11.0. running mean: 11.67210560607248
resetting env. episode reward total was 15.0. running mean: 11.705384550011756
resetting env. episode reward total was 5.0. running mean: 11.638330704511638
resetting env. episode reward total was 10.0. running mean: 11.62194739746652
resetting env. episode reward total was 12.0. running mean: 11.625727923491855
resetting env. episode reward total was 9.0. running mean: 11.599470644256936
resetting env. episode reward total was 7.0. running mean: 11.553475937814367
resetting env. episode reward total was 13.0. running mean: 11.567941178436223
resetting env. episode reward total was 16.0. running mean: 11.61226176665186
resetting env. episode reward total was 19.0. running mean: 11.68613914898534
resetting env. episode reward total was 17.0. running mean:

resetting env. episode reward total was 13.0. running mean: 11.549949325916304
resetting env. episode reward total was 5.0. running mean: 11.484449832657141
resetting env. episode reward total was 11.0. running mean: 11.47960533433057
resetting env. episode reward total was 8.0. running mean: 11.444809280987265
resetting env. episode reward total was 10.0. running mean: 11.430361188177391
resetting env. episode reward total was 12.0. running mean: 11.436057576295617
resetting env. episode reward total was 16.0. running mean: 11.48169700053266
resetting env. episode reward total was 5.0. running mean: 11.416880030527334
resetting env. episode reward total was 17.0. running mean: 11.47271123022206
resetting env. episode reward total was 15.0. running mean: 11.50798411791984
resetting env. episode reward total was 7.0. running mean: 11.462904276740641
resetting env. episode reward total was 13.0. running mean: 11.478275233973235
resetting env. episode reward total was 11.0. running mean: 

resetting env. episode reward total was 14.0. running mean: 11.247512149190605
resetting env. episode reward total was 11.0. running mean: 11.245037027698698
resetting env. episode reward total was 7.0. running mean: 11.202586657421712
resetting env. episode reward total was 10.0. running mean: 11.190560790847494
resetting env. episode reward total was 6.0. running mean: 11.138655182939019
resetting env. episode reward total was 5.0. running mean: 11.077268631109629
resetting env. episode reward total was 11.0. running mean: 11.076495944798532
resetting env. episode reward total was 15.0. running mean: 11.115730985350547
resetting env. episode reward total was 14.0. running mean: 11.144573675497043
resetting env. episode reward total was 9.0. running mean: 11.123127938742073
resetting env. episode reward total was 12.0. running mean: 11.131896659354652
resetting env. episode reward total was 6.0. running mean: 11.080577692761105
resetting env. episode reward total was 14.0. running mea

resetting env. episode reward total was 7.0. running mean: 10.913644393579139
resetting env. episode reward total was 13.0. running mean: 10.934507949643349
resetting env. episode reward total was 3.0. running mean: 10.855162870146914
resetting env. episode reward total was 15.0. running mean: 10.896611241445445
resetting env. episode reward total was 15.0. running mean: 10.93764512903099
resetting env. episode reward total was 12.0. running mean: 10.94826867774068
resetting env. episode reward total was 7.0. running mean: 10.908785990963274
resetting env. episode reward total was 7.0. running mean: 10.869698131053642
resetting env. episode reward total was 17.0. running mean: 10.931001149743105
resetting env. episode reward total was 15.0. running mean: 10.971691138245674
resetting env. episode reward total was 11.0. running mean: 10.971974226863217
resetting env. episode reward total was 18.0. running mean: 11.042254484594585
resetting env. episode reward total was 12.0. running mean

resetting env. episode reward total was 13.0. running mean: 11.20905684592499
resetting env. episode reward total was 9.0. running mean: 11.186966277465741
resetting env. episode reward total was 16.0. running mean: 11.235096614691084
resetting env. episode reward total was 8.0. running mean: 11.202745648544173
resetting env. episode reward total was 12.0. running mean: 11.21071819205873
resetting env. episode reward total was 16.0. running mean: 11.258611010138143
resetting env. episode reward total was 11.0. running mean: 11.256024900036762
resetting env. episode reward total was 12.0. running mean: 11.263464651036394
resetting env. episode reward total was 16.0. running mean: 11.31083000452603
resetting env. episode reward total was 15.0. running mean: 11.34772170448077
resetting env. episode reward total was 11.0. running mean: 11.344244487435962
resetting env. episode reward total was 6.0. running mean: 11.290802042561602
resetting env. episode reward total was 14.0. running mean:

resetting env. episode reward total was 12.0. running mean: 11.55095093644625
resetting env. episode reward total was 11.0. running mean: 11.545441427081787
resetting env. episode reward total was 13.0. running mean: 11.55998701281097
resetting env. episode reward total was 9.0. running mean: 11.53438714268286
resetting env. episode reward total was 4.0. running mean: 11.45904327125603
resetting env. episode reward total was 14.0. running mean: 11.48445283854347
resetting env. episode reward total was 9.0. running mean: 11.459608310158035
resetting env. episode reward total was 13.0. running mean: 11.475012227056455
resetting env. episode reward total was 10.0. running mean: 11.46026210478589
resetting env. episode reward total was 6.0. running mean: 11.405659483738033
resetting env. episode reward total was 14.0. running mean: 11.431602888900652
resetting env. episode reward total was 9.0. running mean: 11.407286860011645
resetting env. episode reward total was 13.0. running mean: 11.

resetting env. episode reward total was 6.0. running mean: 11.128676726447308
resetting env. episode reward total was 17.0. running mean: 11.187389959182834
resetting env. episode reward total was 15.0. running mean: 11.225516059591007
resetting env. episode reward total was 15.0. running mean: 11.263260898995096
resetting env. episode reward total was 10.0. running mean: 11.250628290005144
resetting env. episode reward total was 16.0. running mean: 11.298122007105093
resetting env. episode reward total was 5.0. running mean: 11.235140787034043
resetting env. episode reward total was 8.0. running mean: 11.202789379163702
resetting env. episode reward total was 13.0. running mean: 11.220761485372066
resetting env. episode reward total was 10.0. running mean: 11.208553870518344
resetting env. episode reward total was 15.0. running mean: 11.24646833181316
resetting env. episode reward total was 14.0. running mean: 11.274003648495029
resetting env. episode reward total was 13.0. running me

resetting env. episode reward total was 15.0. running mean: 11.499439517119987
resetting env. episode reward total was 11.0. running mean: 11.494445121948786
resetting env. episode reward total was 8.0. running mean: 11.459500670729298
resetting env. episode reward total was 13.0. running mean: 11.474905664022005
resetting env. episode reward total was 10.0. running mean: 11.460156607381785
resetting env. episode reward total was 15.0. running mean: 11.495555041307968
resetting env. episode reward total was 16.0. running mean: 11.540599490894888
resetting env. episode reward total was 17.0. running mean: 11.595193495985939
resetting env. episode reward total was 1.0. running mean: 11.489241561026079
resetting env. episode reward total was -1.0. running mean: 11.364349145415819
resetting env. episode reward total was 11.0. running mean: 11.36070565396166
resetting env. episode reward total was 9.0. running mean: 11.337098597422042
resetting env. episode reward total was 4.0. running mea

resetting env. episode reward total was 9.0. running mean: 11.596068756956132
resetting env. episode reward total was 14.0. running mean: 11.620108069386571
resetting env. episode reward total was 11.0. running mean: 11.613906988692705
resetting env. episode reward total was 12.0. running mean: 11.617767918805777
resetting env. episode reward total was 16.0. running mean: 11.66159023961772
resetting env. episode reward total was 12.0. running mean: 11.664974337221542
resetting env. episode reward total was 12.0. running mean: 11.668324593849325
resetting env. episode reward total was 16.0. running mean: 11.711641347910833
resetting env. episode reward total was 15.0. running mean: 11.744524934431725
resetting env. episode reward total was 10.0. running mean: 11.727079685087407
resetting env. episode reward total was 7.0. running mean: 11.679808888236533
resetting env. episode reward total was 14.0. running mean: 11.703010799354168
resetting env. episode reward total was 14.0. running m

resetting env. episode reward total was 17.0. running mean: 12.004417975214722
resetting env. episode reward total was 13.0. running mean: 12.014373795462575
resetting env. episode reward total was 10.0. running mean: 11.994230057507949
resetting env. episode reward total was 9.0. running mean: 11.964287756932869
resetting env. episode reward total was 15.0. running mean: 11.994644879363541
resetting env. episode reward total was 18.0. running mean: 12.054698430569905
resetting env. episode reward total was 6.0. running mean: 11.994151446264206
resetting env. episode reward total was 6.0. running mean: 11.934209931801565
resetting env. episode reward total was 8.0. running mean: 11.894867832483548
resetting env. episode reward total was 12.0. running mean: 11.895919154158712
resetting env. episode reward total was 12.0. running mean: 11.896959962617125
resetting env. episode reward total was 14.0. running mean: 11.917990362990954
resetting env. episode reward total was 6.0. running mea

resetting env. episode reward total was 6.0. running mean: 11.09328521030672
resetting env. episode reward total was 10.0. running mean: 11.082352358203652
resetting env. episode reward total was 7.0. running mean: 11.041528834621616
resetting env. episode reward total was 17.0. running mean: 11.1011135462754
resetting env. episode reward total was 16.0. running mean: 11.150102410812647
resetting env. episode reward total was 4.0. running mean: 11.07860138670452
resetting env. episode reward total was 12.0. running mean: 11.087815372837474
resetting env. episode reward total was 12.0. running mean: 11.096937219109098
resetting env. episode reward total was 14.0. running mean: 11.125967846918007
resetting env. episode reward total was 5.0. running mean: 11.064708168448828
resetting env. episode reward total was 18.0. running mean: 11.134061086764339
resetting env. episode reward total was 12.0. running mean: 11.142720475896695
resetting env. episode reward total was 11.0. running mean: 

resetting env. episode reward total was 7.0. running mean: 11.408729136585581
resetting env. episode reward total was 12.0. running mean: 11.414641845219725
resetting env. episode reward total was 13.0. running mean: 11.430495426767528
resetting env. episode reward total was 10.0. running mean: 11.416190472499853
resetting env. episode reward total was 15.0. running mean: 11.452028567774855
resetting env. episode reward total was 14.0. running mean: 11.477508282097107
resetting env. episode reward total was 14.0. running mean: 11.502733199276136
resetting env. episode reward total was 17.0. running mean: 11.557705867283374
resetting env. episode reward total was 7.0. running mean: 11.51212880861054
resetting env. episode reward total was 15.0. running mean: 11.547007520524435
resetting env. episode reward total was 3.0. running mean: 11.46153744531919
resetting env. episode reward total was 14.0. running mean: 11.486922070865997
resetting env. episode reward total was 14.0. running mea

resetting env. episode reward total was 11.0. running mean: 11.551022738726573
resetting env. episode reward total was 16.0. running mean: 11.595512511339308
resetting env. episode reward total was 11.0. running mean: 11.589557386225914
resetting env. episode reward total was 8.0. running mean: 11.553661812363655
resetting env. episode reward total was 14.0. running mean: 11.57812519424002
resetting env. episode reward total was 10.0. running mean: 11.56234394229762
resetting env. episode reward total was 12.0. running mean: 11.566720502874643
resetting env. episode reward total was 11.0. running mean: 11.561053297845897
resetting env. episode reward total was 17.0. running mean: 11.615442764867437
resetting env. episode reward total was 10.0. running mean: 11.599288337218763
resetting env. episode reward total was 14.0. running mean: 11.623295453846575
resetting env. episode reward total was 13.0. running mean: 11.63706249930811
resetting env. episode reward total was 12.0. running me

resetting env. episode reward total was 17.0. running mean: 11.519107495928582
resetting env. episode reward total was 5.0. running mean: 11.453916420969296
resetting env. episode reward total was 10.0. running mean: 11.439377256759602
resetting env. episode reward total was 14.0. running mean: 11.464983484192008
resetting env. episode reward total was 9.0. running mean: 11.440333649350087
resetting env. episode reward total was 18.0. running mean: 11.505930312856586
resetting env. episode reward total was 9.0. running mean: 11.48087100972802
resetting env. episode reward total was 16.0. running mean: 11.52606229963074
resetting env. episode reward total was 12.0. running mean: 11.53080167663443
resetting env. episode reward total was 8.0. running mean: 11.495493659868087
resetting env. episode reward total was 6.0. running mean: 11.440538723269407
resetting env. episode reward total was 17.0. running mean: 11.496133336036714
resetting env. episode reward total was 18.0. running mean: 

resetting env. episode reward total was 14.0. running mean: 11.916396506037158
resetting env. episode reward total was 15.0. running mean: 11.947232540976787
resetting env. episode reward total was 13.0. running mean: 11.957760215567019
resetting env. episode reward total was 9.0. running mean: 11.928182613411348
resetting env. episode reward total was 8.0. running mean: 11.888900787277235
resetting env. episode reward total was 13.0. running mean: 11.900011779404464
resetting env. episode reward total was 10.0. running mean: 11.881011661610419
resetting env. episode reward total was 10.0. running mean: 11.862201544994313
resetting env. episode reward total was 5.0. running mean: 11.79357952954437
resetting env. episode reward total was 7.0. running mean: 11.745643734248928
resetting env. episode reward total was 9.0. running mean: 11.718187296906438
resetting env. episode reward total was 6.0. running mean: 11.661005423937373
resetting env. episode reward total was 12.0. running mean:

resetting env. episode reward total was 11.0. running mean: 12.449254047057206
resetting env. episode reward total was 13.0. running mean: 12.454761506586635
resetting env. episode reward total was 12.0. running mean: 12.450213891520768
resetting env. episode reward total was 11.0. running mean: 12.43571175260556
resetting env. episode reward total was 16.0. running mean: 12.471354635079503
resetting env. episode reward total was 16.0. running mean: 12.506641088728708
resetting env. episode reward total was 17.0. running mean: 12.55157467784142
resetting env. episode reward total was 15.0. running mean: 12.576058931063006
resetting env. episode reward total was 15.0. running mean: 12.600298341752376
resetting env. episode reward total was 9.0. running mean: 12.564295358334853
resetting env. episode reward total was 12.0. running mean: 12.558652404751504
resetting env. episode reward total was 10.0. running mean: 12.533065880703989
resetting env. episode reward total was 13.0. running m

resetting env. episode reward total was 13.0. running mean: 12.314299814169111
resetting env. episode reward total was 14.0. running mean: 12.33115681602742
resetting env. episode reward total was 15.0. running mean: 12.357845247867147
resetting env. episode reward total was 10.0. running mean: 12.334266795388475
resetting env. episode reward total was 11.0. running mean: 12.320924127434589
resetting env. episode reward total was 14.0. running mean: 12.337714886160244
resetting env. episode reward total was 15.0. running mean: 12.364337737298642
resetting env. episode reward total was 11.0. running mean: 12.350694359925656
resetting env. episode reward total was 12.0. running mean: 12.347187416326399
resetting env. episode reward total was 14.0. running mean: 12.363715542163135
resetting env. episode reward total was 6.0. running mean: 12.300078386741504
resetting env. episode reward total was 16.0. running mean: 12.33707760287409
resetting env. episode reward total was 9.0. running me

resetting env. episode reward total was 12.0. running mean: 12.317286482065626
resetting env. episode reward total was 14.0. running mean: 12.33411361724497
resetting env. episode reward total was 15.0. running mean: 12.360772481072521
resetting env. episode reward total was 13.0. running mean: 12.367164756261797
resetting env. episode reward total was 12.0. running mean: 12.363493108699178
resetting env. episode reward total was 14.0. running mean: 12.379858177612187
resetting env. episode reward total was 14.0. running mean: 12.396059595836066
resetting env. episode reward total was 9.0. running mean: 12.362098999877706
resetting env. episode reward total was 13.0. running mean: 12.36847800987893
resetting env. episode reward total was 11.0. running mean: 12.354793229780139
resetting env. episode reward total was 9.0. running mean: 12.321245297482337
resetting env. episode reward total was 18.0. running mean: 12.378032844507514
resetting env. episode reward total was 13.0. running me

resetting env. episode reward total was 2.0. running mean: 12.3567550176788
resetting env. episode reward total was 15.0. running mean: 12.383187467502012
resetting env. episode reward total was 15.0. running mean: 12.409355592826993
resetting env. episode reward total was 2.0. running mean: 12.305262036898721
resetting env. episode reward total was 9.0. running mean: 12.272209416529734
resetting env. episode reward total was 16.0. running mean: 12.309487322364436
resetting env. episode reward total was 13.0. running mean: 12.316392449140793
resetting env. episode reward total was 5.0. running mean: 12.243228524649385
resetting env. episode reward total was 9.0. running mean: 12.21079623940289
resetting env. episode reward total was 15.0. running mean: 12.238688277008862
resetting env. episode reward total was 13.0. running mean: 12.246301394238774
resetting env. episode reward total was 17.0. running mean: 12.293838380296386
resetting env. episode reward total was 13.0. running mean: 

resetting env. episode reward total was 11.0. running mean: 12.564050620397637
resetting env. episode reward total was 10.0. running mean: 12.53841011419366
resetting env. episode reward total was 11.0. running mean: 12.523026013051723
resetting env. episode reward total was 16.0. running mean: 12.557795752921205
resetting env. episode reward total was 13.0. running mean: 12.562217795391994
resetting env. episode reward total was 15.0. running mean: 12.586595617438075
resetting env. episode reward total was -1.0. running mean: 12.450729661263694
resetting env. episode reward total was 20.0. running mean: 12.526222364651055
resetting env. episode reward total was 12.0. running mean: 12.520960141004544
resetting env. episode reward total was 10.0. running mean: 12.495750539594498
resetting env. episode reward total was 18.0. running mean: 12.550793034198552
resetting env. episode reward total was 13.0. running mean: 12.555285103856567
resetting env. episode reward total was 8.0. running 

resetting env. episode reward total was 13.0. running mean: 12.811998141843484
resetting env. episode reward total was 14.0. running mean: 12.82387816042505
resetting env. episode reward total was 15.0. running mean: 12.845639378820799
resetting env. episode reward total was 16.0. running mean: 12.877182985032592
resetting env. episode reward total was 15.0. running mean: 12.898411155182266
resetting env. episode reward total was 13.0. running mean: 12.899427043630444
resetting env. episode reward total was 13.0. running mean: 12.90043277319414
resetting env. episode reward total was 12.0. running mean: 12.891428445462198
resetting env. episode reward total was 10.0. running mean: 12.862514161007576
resetting env. episode reward total was 16.0. running mean: 12.8938890193975
resetting env. episode reward total was 9.0. running mean: 12.854950129203525
resetting env. episode reward total was 14.0. running mean: 12.86640062791149
resetting env. episode reward total was 8.0. running mean:

resetting env. episode reward total was 15.0. running mean: 12.493978029067616
resetting env. episode reward total was 6.0. running mean: 12.42903824877694
resetting env. episode reward total was 6.0. running mean: 12.364747866289171
resetting env. episode reward total was 15.0. running mean: 12.39110038762628
resetting env. episode reward total was 16.0. running mean: 12.427189383750017
resetting env. episode reward total was 15.0. running mean: 12.452917489912517
resetting env. episode reward total was 8.0. running mean: 12.408388315013392
resetting env. episode reward total was 15.0. running mean: 12.434304431863259
resetting env. episode reward total was 12.0. running mean: 12.429961387544626
resetting env. episode reward total was 14.0. running mean: 12.44566177366918
resetting env. episode reward total was 15.0. running mean: 12.471205155932488
resetting env. episode reward total was 17.0. running mean: 12.516493104373163
resetting env. episode reward total was 12.0. running mean

resetting env. episode reward total was 8.0. running mean: 12.38141704218858
resetting env. episode reward total was 8.0. running mean: 12.337602871766695
resetting env. episode reward total was 14.0. running mean: 12.354226843049029
resetting env. episode reward total was 11.0. running mean: 12.340684574618537
resetting env. episode reward total was 15.0. running mean: 12.367277728872352
resetting env. episode reward total was 14.0. running mean: 12.38360495158363
resetting env. episode reward total was 13.0. running mean: 12.389768902067795
resetting env. episode reward total was 11.0. running mean: 12.375871213047116
resetting env. episode reward total was 14.0. running mean: 12.392112500916646
resetting env. episode reward total was 9.0. running mean: 12.358191375907479
resetting env. episode reward total was 15.0. running mean: 12.384609462148404
resetting env. episode reward total was 9.0. running mean: 12.35076336752692
resetting env. episode reward total was 15.0. running mean:

resetting env. episode reward total was 13.0. running mean: 12.066542684039712
resetting env. episode reward total was 16.0. running mean: 12.105877257199316
resetting env. episode reward total was 11.0. running mean: 12.094818484627321
resetting env. episode reward total was 9.0. running mean: 12.063870299781048
resetting env. episode reward total was 12.0. running mean: 12.063231596783236
resetting env. episode reward total was 18.0. running mean: 12.122599280815404
resetting env. episode reward total was 15.0. running mean: 12.15137328800725
resetting env. episode reward total was 16.0. running mean: 12.189859555127176
resetting env. episode reward total was 12.0. running mean: 12.187960959575904
resetting env. episode reward total was 17.0. running mean: 12.236081349980145
resetting env. episode reward total was 13.0. running mean: 12.243720536480344
resetting env. episode reward total was 8.0. running mean: 12.20128333111554
resetting env. episode reward total was 7.0. running mea

resetting env. episode reward total was 15.0. running mean: 12.089937153985723
resetting env. episode reward total was 12.0. running mean: 12.089037782445864
resetting env. episode reward total was 8.0. running mean: 12.048147404621405
resetting env. episode reward total was 13.0. running mean: 12.057665930575192
resetting env. episode reward total was 12.0. running mean: 12.057089271269438
resetting env. episode reward total was 17.0. running mean: 12.106518378556745
resetting env. episode reward total was 19.0. running mean: 12.175453194771176
resetting env. episode reward total was 11.0. running mean: 12.163698662823464
resetting env. episode reward total was 7.0. running mean: 12.112061676195228
resetting env. episode reward total was 8.0. running mean: 12.070941059433276
resetting env. episode reward total was 11.0. running mean: 12.060231648838942
resetting env. episode reward total was 11.0. running mean: 12.049629332350552
resetting env. episode reward total was 9.0. running me

resetting env. episode reward total was 13.0. running mean: 12.115076580158163
resetting env. episode reward total was 13.0. running mean: 12.123925814356582
resetting env. episode reward total was 13.0. running mean: 12.132686556213017
resetting env. episode reward total was 11.0. running mean: 12.121359690650886
resetting env. episode reward total was 11.0. running mean: 12.110146093744376
resetting env. episode reward total was 12.0. running mean: 12.109044632806931
resetting env. episode reward total was 10.0. running mean: 12.08795418647886
resetting env. episode reward total was 20.0. running mean: 12.167074644614072
resetting env. episode reward total was 14.0. running mean: 12.185403898167932
resetting env. episode reward total was 7.0. running mean: 12.133549859186253
resetting env. episode reward total was 12.0. running mean: 12.13221436059439
resetting env. episode reward total was 11.0. running mean: 12.120892216988445
resetting env. episode reward total was 7.0. running me

resetting env. episode reward total was 10.0. running mean: 12.822222215503205
resetting env. episode reward total was 16.0. running mean: 12.853999993348173
resetting env. episode reward total was 13.0. running mean: 12.855459993414692
resetting env. episode reward total was 17.0. running mean: 12.896905393480546
resetting env. episode reward total was 14.0. running mean: 12.907936339545742
resetting env. episode reward total was 11.0. running mean: 12.888856976150285
resetting env. episode reward total was 15.0. running mean: 12.909968406388781
resetting env. episode reward total was 8.0. running mean: 12.860868722324893
resetting env. episode reward total was 7.0. running mean: 12.802260035101645
resetting env. episode reward total was 10.0. running mean: 12.774237434750628
resetting env. episode reward total was 9.0. running mean: 12.73649506040312
resetting env. episode reward total was 5.0. running mean: 12.65913010979909
resetting env. episode reward total was 17.0. running mean

resetting env. episode reward total was 13.0. running mean: 12.056393939141566
resetting env. episode reward total was -6.0. running mean: 11.87582999975015
resetting env. episode reward total was 14.0. running mean: 11.897071699752649
resetting env. episode reward total was 2.0. running mean: 11.798100982755122
resetting env. episode reward total was 11.0. running mean: 11.79011997292757
resetting env. episode reward total was 19.0. running mean: 11.862218773198293
resetting env. episode reward total was 9.0. running mean: 11.83359658546631
resetting env. episode reward total was 14.0. running mean: 11.855260619611647
resetting env. episode reward total was 8.0. running mean: 11.81670801341553
resetting env. episode reward total was 12.0. running mean: 11.818540933281374
resetting env. episode reward total was 12.0. running mean: 11.82035552394856
resetting env. episode reward total was 9.0. running mean: 11.792151968709074
resetting env. episode reward total was 3.0. running mean: 11

resetting env. episode reward total was 15.0. running mean: 12.6628694669712
resetting env. episode reward total was 8.0. running mean: 12.616240772301488
resetting env. episode reward total was 18.0. running mean: 12.670078364578472
resetting env. episode reward total was 16.0. running mean: 12.703377580932688
resetting env. episode reward total was 14.0. running mean: 12.716343805123362
resetting env. episode reward total was 16.0. running mean: 12.749180367072128
resetting env. episode reward total was 10.0. running mean: 12.721688563401406
resetting env. episode reward total was 13.0. running mean: 12.724471677767392
resetting env. episode reward total was 14.0. running mean: 12.737226960989718
resetting env. episode reward total was 17.0. running mean: 12.779854691379821
resetting env. episode reward total was 10.0. running mean: 12.752056144466023
resetting env. episode reward total was 15.0. running mean: 12.774535583021363
resetting env. episode reward total was 18.0. running m

resetting env. episode reward total was 11.0. running mean: 12.757086672331553
resetting env. episode reward total was 8.0. running mean: 12.709515805608238
resetting env. episode reward total was 14.0. running mean: 12.722420647552156
resetting env. episode reward total was 12.0. running mean: 12.715196441076634
resetting env. episode reward total was 11.0. running mean: 12.698044476665867
resetting env. episode reward total was 12.0. running mean: 12.691064031899208
resetting env. episode reward total was 8.0. running mean: 12.644153391580215
resetting env. episode reward total was 16.0. running mean: 12.677711857664413
resetting env. episode reward total was 9.0. running mean: 12.640934739087768
resetting env. episode reward total was 7.0. running mean: 12.584525391696891
resetting env. episode reward total was 16.0. running mean: 12.618680137779922
resetting env. episode reward total was 15.0. running mean: 12.642493336402124
resetting env. episode reward total was 3.0. running mea

resetting env. episode reward total was 15.0. running mean: 12.552601482738153
resetting env. episode reward total was 12.0. running mean: 12.547075467910771
resetting env. episode reward total was 13.0. running mean: 12.551604713231663
resetting env. episode reward total was 20.0. running mean: 12.626088666099346
resetting env. episode reward total was 9.0. running mean: 12.589827779438352
resetting env. episode reward total was 19.0. running mean: 12.653929501643967
resetting env. episode reward total was 15.0. running mean: 12.677390206627528
resetting env. episode reward total was 11.0. running mean: 12.660616304561252
resetting env. episode reward total was 15.0. running mean: 12.684010141515639
resetting env. episode reward total was 16.0. running mean: 12.717170040100482
resetting env. episode reward total was 10.0. running mean: 12.689998339699477
resetting env. episode reward total was 9.0. running mean: 12.65309835630248
resetting env. episode reward total was 15.0. running m

resetting env. episode reward total was 16.0. running mean: 12.844840393945711
resetting env. episode reward total was 13.0. running mean: 12.846391990006255
resetting env. episode reward total was 12.0. running mean: 12.837928070106193
resetting env. episode reward total was 14.0. running mean: 12.849548789405132
resetting env. episode reward total was 14.0. running mean: 12.861053301511081
resetting env. episode reward total was 6.0. running mean: 12.79244276849597
resetting env. episode reward total was 10.0. running mean: 12.76451834081101
resetting env. episode reward total was 6.0. running mean: 12.696873157402901
resetting env. episode reward total was 17.0. running mean: 12.739904425828872
resetting env. episode reward total was 14.0. running mean: 12.752505381570584
resetting env. episode reward total was 8.0. running mean: 12.704980327754878
resetting env. episode reward total was 6.0. running mean: 12.63793052447733
resetting env. episode reward total was 14.0. running mean:

resetting env. episode reward total was 7.0. running mean: 12.313350017472787
resetting env. episode reward total was 11.0. running mean: 12.30021651729806
resetting env. episode reward total was 8.0. running mean: 12.257214352125079
resetting env. episode reward total was 10.0. running mean: 12.234642208603827
resetting env. episode reward total was 14.0. running mean: 12.25229578651779
resetting env. episode reward total was 13.0. running mean: 12.259772828652613
resetting env. episode reward total was 10.0. running mean: 12.237175100366086
resetting env. episode reward total was 12.0. running mean: 12.234803349362425
resetting env. episode reward total was 7.0. running mean: 12.1824553158688
resetting env. episode reward total was 9.0. running mean: 12.150630762710112
resetting env. episode reward total was 5.0. running mean: 12.079124455083011
resetting env. episode reward total was 10.0. running mean: 12.058333210532181
resetting env. episode reward total was 1.0. running mean: 11

resetting env. episode reward total was 12.0. running mean: 11.427280010326776
resetting env. episode reward total was 17.0. running mean: 11.483007210223507
resetting env. episode reward total was 6.0. running mean: 11.428177138121272
resetting env. episode reward total was 10.0. running mean: 11.413895366740059
resetting env. episode reward total was 12.0. running mean: 11.419756413072657
resetting env. episode reward total was 13.0. running mean: 11.43555884894193
resetting env. episode reward total was 10.0. running mean: 11.421203260452511
resetting env. episode reward total was 11.0. running mean: 11.416991227847985
resetting env. episode reward total was 14.0. running mean: 11.442821315569505
resetting env. episode reward total was 12.0. running mean: 11.448393102413808
resetting env. episode reward total was 13.0. running mean: 11.463909171389671
resetting env. episode reward total was 14.0. running mean: 11.489270079675775
resetting env. episode reward total was 12.0. running 

resetting env. episode reward total was 17.0. running mean: 12.182571843887617
resetting env. episode reward total was 7.0. running mean: 12.130746125448741
resetting env. episode reward total was 14.0. running mean: 12.149438664194255
resetting env. episode reward total was 3.0. running mean: 12.057944277552313
resetting env. episode reward total was 8.0. running mean: 12.017364834776789
resetting env. episode reward total was 14.0. running mean: 12.037191186429022
resetting env. episode reward total was 12.0. running mean: 12.03681927456473
resetting env. episode reward total was 12.0. running mean: 12.03645108181908
resetting env. episode reward total was 16.0. running mean: 12.07608657100089
resetting env. episode reward total was 15.0. running mean: 12.10532570529088
resetting env. episode reward total was 12.0. running mean: 12.104272448237971
resetting env. episode reward total was 12.0. running mean: 12.103229723755591
resetting env. episode reward total was 7.0. running mean: 

resetting env. episode reward total was 16.0. running mean: 11.976964940251493
resetting env. episode reward total was 14.0. running mean: 11.997195290848978
resetting env. episode reward total was 18.0. running mean: 12.057223337940489
resetting env. episode reward total was 15.0. running mean: 12.086651104561083
resetting env. episode reward total was 17.0. running mean: 12.135784593515472
resetting env. episode reward total was 12.0. running mean: 12.134426747580317
resetting env. episode reward total was 18.0. running mean: 12.193082480104513
resetting env. episode reward total was 10.0. running mean: 12.171151655303467
resetting env. episode reward total was 14.0. running mean: 12.189440138750433
resetting env. episode reward total was 15.0. running mean: 12.21754573736293
resetting env. episode reward total was 14.0. running mean: 12.2353702799893
resetting env. episode reward total was 16.0. running mean: 12.273016577189408
resetting env. episode reward total was 12.0. running m

resetting env. episode reward total was 9.0. running mean: 12.187692882127937
resetting env. episode reward total was 13.0. running mean: 12.195815953306658
resetting env. episode reward total was 9.0. running mean: 12.163857793773591
resetting env. episode reward total was 14.0. running mean: 12.182219215835856
resetting env. episode reward total was 13.0. running mean: 12.190397023677498
resetting env. episode reward total was 10.0. running mean: 12.168493053440722
resetting env. episode reward total was 11.0. running mean: 12.156808122906314
resetting env. episode reward total was 15.0. running mean: 12.185240041677252
resetting env. episode reward total was 4.0. running mean: 12.103387641260479
resetting env. episode reward total was 10.0. running mean: 12.082353764847873
resetting env. episode reward total was 15.0. running mean: 12.111530227199395
resetting env. episode reward total was 18.0. running mean: 12.170414924927401
resetting env. episode reward total was 15.0. running m

resetting env. episode reward total was 19.0. running mean: 12.6209499650351
resetting env. episode reward total was 16.0. running mean: 12.65474046538475
resetting env. episode reward total was 7.0. running mean: 12.598193060730903
resetting env. episode reward total was 12.0. running mean: 12.592211130123593
resetting env. episode reward total was 7.0. running mean: 12.536289018822357
resetting env. episode reward total was 14.0. running mean: 12.550926128634133
resetting env. episode reward total was 16.0. running mean: 12.585416867347792
resetting env. episode reward total was 14.0. running mean: 12.599562698674315
resetting env. episode reward total was 13.0. running mean: 12.603567071687573
resetting env. episode reward total was 13.0. running mean: 12.607531400970698
resetting env. episode reward total was 12.0. running mean: 12.60145608696099
resetting env. episode reward total was 15.0. running mean: 12.62544152609138
resetting env. episode reward total was 12.0. running mean:

resetting env. episode reward total was 4.0. running mean: 12.82077941257926
resetting env. episode reward total was 14.0. running mean: 12.832571618453468
resetting env. episode reward total was 12.0. running mean: 12.824245902268933
resetting env. episode reward total was 14.0. running mean: 12.836003443246243
resetting env. episode reward total was 16.0. running mean: 12.86764340881378
resetting env. episode reward total was 13.0. running mean: 12.868966974725643
resetting env. episode reward total was 12.0. running mean: 12.860277304978386
resetting env. episode reward total was 12.0. running mean: 12.851674531928602
resetting env. episode reward total was 17.0. running mean: 12.893157786609315
resetting env. episode reward total was 12.0. running mean: 12.884226208743222
resetting env. episode reward total was 13.0. running mean: 12.88538394665579
resetting env. episode reward total was 9.0. running mean: 12.846530107189231
resetting env. episode reward total was 16.0. running mea

resetting env. episode reward total was 15.0. running mean: 12.781283476029913
resetting env. episode reward total was 15.0. running mean: 12.803470641269614
resetting env. episode reward total was 17.0. running mean: 12.845435934856917
resetting env. episode reward total was 9.0. running mean: 12.806981575508347
resetting env. episode reward total was 15.0. running mean: 12.828911759753264
resetting env. episode reward total was 20.0. running mean: 12.900622642155732
resetting env. episode reward total was 14.0. running mean: 12.911616415734175
resetting env. episode reward total was 18.0. running mean: 12.962500251576833
resetting env. episode reward total was 16.0. running mean: 12.992875249061065
resetting env. episode reward total was 16.0. running mean: 13.022946496570453
resetting env. episode reward total was 10.0. running mean: 12.992717031604748
resetting env. episode reward total was 13.0. running mean: 12.992789861288701
resetting env. episode reward total was 5.0. running 

resetting env. episode reward total was 14.0. running mean: 12.752485512892298
resetting env. episode reward total was 12.0. running mean: 12.744960657763373
resetting env. episode reward total was 14.0. running mean: 12.75751105118574
resetting env. episode reward total was 8.0. running mean: 12.709935940673882
resetting env. episode reward total was -3.0. running mean: 12.552836581267144
resetting env. episode reward total was 15.0. running mean: 12.577308215454472
resetting env. episode reward total was 15.0. running mean: 12.601535133299928
resetting env. episode reward total was 5.0. running mean: 12.52551978196693
resetting env. episode reward total was 14.0. running mean: 12.54026458414726
resetting env. episode reward total was 10.0. running mean: 12.514861938305788
resetting env. episode reward total was 16.0. running mean: 12.54971331892273
resetting env. episode reward total was 11.0. running mean: 12.534216185733502
resetting env. episode reward total was 16.0. running mean

resetting env. episode reward total was 5.0. running mean: 12.527129907277107
resetting env. episode reward total was 12.0. running mean: 12.521858608204335
resetting env. episode reward total was 15.0. running mean: 12.546640022122292
resetting env. episode reward total was 13.0. running mean: 12.55117362190107
resetting env. episode reward total was 6.0. running mean: 12.48566188568206
resetting env. episode reward total was 13.0. running mean: 12.49080526682524
resetting env. episode reward total was 6.0. running mean: 12.425897214156986
resetting env. episode reward total was 14.0. running mean: 12.441638242015417
resetting env. episode reward total was 10.0. running mean: 12.417221859595262
resetting env. episode reward total was 17.0. running mean: 12.463049640999309
resetting env. episode reward total was 18.0. running mean: 12.518419144589314
resetting env. episode reward total was 14.0. running mean: 12.533234953143422
resetting env. episode reward total was 15.0. running mean

resetting env. episode reward total was 14.0. running mean: 12.391059919122055
resetting env. episode reward total was 13.0. running mean: 12.397149319930834
resetting env. episode reward total was 14.0. running mean: 12.413177826731527
resetting env. episode reward total was 10.0. running mean: 12.38904604846421
resetting env. episode reward total was 16.0. running mean: 12.42515558797957
resetting env. episode reward total was 12.0. running mean: 12.420904032099772
resetting env. episode reward total was 8.0. running mean: 12.376694991778773
resetting env. episode reward total was 13.0. running mean: 12.382928041860986
resetting env. episode reward total was 15.0. running mean: 12.409098761442376
resetting env. episode reward total was 9.0. running mean: 12.375007773827951
resetting env. episode reward total was 16.0. running mean: 12.411257696089672
resetting env. episode reward total was 13.0. running mean: 12.417145119128776
resetting env. episode reward total was 13.0. running me

resetting env. episode reward total was 12.0. running mean: 12.04564518727825
resetting env. episode reward total was 12.0. running mean: 12.045188735405468
resetting env. episode reward total was 17.0. running mean: 12.094736848051413
resetting env. episode reward total was 14.0. running mean: 12.1137894795709
resetting env. episode reward total was 10.0. running mean: 12.09265158477519
resetting env. episode reward total was 13.0. running mean: 12.101725068927438
resetting env. episode reward total was 15.0. running mean: 12.130707818238164
resetting env. episode reward total was 14.0. running mean: 12.149400740055784
resetting env. episode reward total was 17.0. running mean: 12.197906732655225
resetting env. episode reward total was 11.0. running mean: 12.185927665328672
resetting env. episode reward total was 14.0. running mean: 12.204068388675385
resetting env. episode reward total was 16.0. running mean: 12.242027704788631
resetting env. episode reward total was 11.0. running me

resetting env. episode reward total was 11.0. running mean: 12.205260160314014
resetting env. episode reward total was 13.0. running mean: 12.213207558710874
resetting env. episode reward total was 10.0. running mean: 12.191075483123765
resetting env. episode reward total was 6.0. running mean: 12.129164728292528
resetting env. episode reward total was 14.0. running mean: 12.147873081009603
resetting env. episode reward total was 16.0. running mean: 12.186394350199507
resetting env. episode reward total was 18.0. running mean: 12.24453040669751
resetting env. episode reward total was 17.0. running mean: 12.292085102630535
resetting env. episode reward total was 15.0. running mean: 12.31916425160423
resetting env. episode reward total was 17.0. running mean: 12.365972609088187
resetting env. episode reward total was 12.0. running mean: 12.362312882997305
resetting env. episode reward total was 12.0. running mean: 12.358689754167331
resetting env. episode reward total was 14.0. running m

resetting env. episode reward total was 15.0. running mean: 12.346476504218618
resetting env. episode reward total was 12.0. running mean: 12.343011739176431
resetting env. episode reward total was 7.0. running mean: 12.289581621784667
resetting env. episode reward total was 15.0. running mean: 12.31668580556682
resetting env. episode reward total was 13.0. running mean: 12.323518947511152
resetting env. episode reward total was 10.0. running mean: 12.30028375803604
resetting env. episode reward total was 17.0. running mean: 12.34728092045568
resetting env. episode reward total was 13.0. running mean: 12.353808111251123
resetting env. episode reward total was 7.0. running mean: 12.300270030138611
resetting env. episode reward total was 19.0. running mean: 12.367267329837224
resetting env. episode reward total was 16.0. running mean: 12.403594656538852
resetting env. episode reward total was 6.0. running mean: 12.339558709973463
resetting env. episode reward total was 11.0. running mean

resetting env. episode reward total was 5.0. running mean: 12.582745190735332
resetting env. episode reward total was 9.0. running mean: 12.546917738827979
resetting env. episode reward total was 9.0. running mean: 12.511448561439698
resetting env. episode reward total was 17.0. running mean: 12.556334075825301
resetting env. episode reward total was 12.0. running mean: 12.550770735067047
resetting env. episode reward total was 12.0. running mean: 12.545263027716375
resetting env. episode reward total was 11.0. running mean: 12.529810397439212
resetting env. episode reward total was 11.0. running mean: 12.514512293464819
resetting env. episode reward total was 13.0. running mean: 12.51936717053017
resetting env. episode reward total was 17.0. running mean: 12.564173498824868
resetting env. episode reward total was 11.0. running mean: 12.54853176383662
resetting env. episode reward total was 16.0. running mean: 12.583046446198253
resetting env. episode reward total was 12.0. running mea

resetting env. episode reward total was 14.0. running mean: 12.220457372419258
resetting env. episode reward total was 16.0. running mean: 12.258252798695064
resetting env. episode reward total was 13.0. running mean: 12.265670270708114
resetting env. episode reward total was 15.0. running mean: 12.293013568001033
resetting env. episode reward total was 14.0. running mean: 12.310083432321024
resetting env. episode reward total was 16.0. running mean: 12.346982597997814
resetting env. episode reward total was 8.0. running mean: 12.303512772017836
resetting env. episode reward total was 16.0. running mean: 12.340477644297657
resetting env. episode reward total was 14.0. running mean: 12.357072867854681
resetting env. episode reward total was 9.0. running mean: 12.323502139176133
resetting env. episode reward total was 12.0. running mean: 12.320267117784372
resetting env. episode reward total was 17.0. running mean: 12.367064446606529
resetting env. episode reward total was 17.0. running 

resetting env. episode reward total was 15.0. running mean: 12.241716620122915
resetting env. episode reward total was 14.0. running mean: 12.259299453921686
resetting env. episode reward total was 13.0. running mean: 12.26670645938247
resetting env. episode reward total was 9.0. running mean: 12.234039394788645
resetting env. episode reward total was 15.0. running mean: 12.261699000840759
resetting env. episode reward total was 15.0. running mean: 12.28908201083235
resetting env. episode reward total was 3.0. running mean: 12.196191190724026
resetting env. episode reward total was 11.0. running mean: 12.184229278816785
resetting env. episode reward total was 15.0. running mean: 12.212386986028617
resetting env. episode reward total was 9.0. running mean: 12.18026311616833
resetting env. episode reward total was 16.0. running mean: 12.218460485006647
resetting env. episode reward total was 13.0. running mean: 12.226275880156582
resetting env. episode reward total was 6.0. running mean:

resetting env. episode reward total was 6.0. running mean: 12.58553537515426
resetting env. episode reward total was 12.0. running mean: 12.579680021402716
resetting env. episode reward total was 11.0. running mean: 12.563883221188688
resetting env. episode reward total was 16.0. running mean: 12.5982443889768
resetting env. episode reward total was 18.0. running mean: 12.652261945087032
resetting env. episode reward total was 13.0. running mean: 12.655739325636162
resetting env. episode reward total was 14.0. running mean: 12.6691819323798
resetting env. episode reward total was 14.0. running mean: 12.682490113056003
resetting env. episode reward total was 15.0. running mean: 12.705665211925444
resetting env. episode reward total was 10.0. running mean: 12.67860855980619
resetting env. episode reward total was 14.0. running mean: 12.691822474208129
resetting env. episode reward total was 15.0. running mean: 12.714904249466048
resetting env. episode reward total was 8.0. running mean: 

resetting env. episode reward total was 19.0. running mean: 12.857771375531449
resetting env. episode reward total was 12.0. running mean: 12.849193661776134
resetting env. episode reward total was 14.0. running mean: 12.860701725158373
resetting env. episode reward total was 18.0. running mean: 12.912094707906789
resetting env. episode reward total was 14.0. running mean: 12.922973760827722
resetting env. episode reward total was 10.0. running mean: 12.893744023219444
resetting env. episode reward total was 13.0. running mean: 12.89480658298725
resetting env. episode reward total was 12.0. running mean: 12.885858517157377
resetting env. episode reward total was 15.0. running mean: 12.906999931985803
resetting env. episode reward total was 15.0. running mean: 12.927929932665945
resetting env. episode reward total was 13.0. running mean: 12.928650633339286
resetting env. episode reward total was 14.0. running mean: 12.939364127005893
resetting env. episode reward total was 13.0. running

resetting env. episode reward total was 13.0. running mean: 13.14429782262269
resetting env. episode reward total was 7.0. running mean: 13.082854844396463
resetting env. episode reward total was 10.0. running mean: 13.052026295952498
resetting env. episode reward total was 17.0. running mean: 13.091506032992973
resetting env. episode reward total was 13.0. running mean: 13.090590972663044
resetting env. episode reward total was 10.0. running mean: 13.059685062936413
resetting env. episode reward total was 16.0. running mean: 13.089088212307049
resetting env. episode reward total was 19.0. running mean: 13.148197330183978
resetting env. episode reward total was 13.0. running mean: 13.146715356882138
resetting env. episode reward total was 11.0. running mean: 13.125248203313316
resetting env. episode reward total was 11.0. running mean: 13.103995721280182
resetting env. episode reward total was 13.0. running mean: 13.10295576406738
resetting env. episode reward total was 13.0. running m

resetting env. episode reward total was 18.0. running mean: 13.018871231836306
resetting env. episode reward total was 10.0. running mean: 12.988682519517942
resetting env. episode reward total was 14.0. running mean: 12.998795694322764
resetting env. episode reward total was 15.0. running mean: 13.018807737379536
resetting env. episode reward total was 7.0. running mean: 12.958619660005741
resetting env. episode reward total was 16.0. running mean: 12.989033463405685
resetting env. episode reward total was 13.0. running mean: 12.989143128771628
resetting env. episode reward total was 14.0. running mean: 12.999251697483913
resetting env. episode reward total was 15.0. running mean: 13.019259180509074
resetting env. episode reward total was 14.0. running mean: 13.029066588703984
resetting env. episode reward total was 16.0. running mean: 13.058775922816944
resetting env. episode reward total was 9.0. running mean: 13.018188163588773
resetting env. episode reward total was 17.0. running 

resetting env. episode reward total was 14.0. running mean: 12.961015845669886
resetting env. episode reward total was 16.0. running mean: 12.991405687213186
resetting env. episode reward total was 12.0. running mean: 12.981491630341054
resetting env. episode reward total was 15.0. running mean: 13.001676714037643
resetting env. episode reward total was 15.0. running mean: 13.021659946897266
resetting env. episode reward total was 5.0. running mean: 12.941443347428294
resetting env. episode reward total was 11.0. running mean: 12.92202891395401
resetting env. episode reward total was 16.0. running mean: 12.95280862481447
resetting env. episode reward total was 9.0. running mean: 12.913280538566324
resetting env. episode reward total was 14.0. running mean: 12.92414773318066
resetting env. episode reward total was 13.0. running mean: 12.924906255848855
resetting env. episode reward total was 15.0. running mean: 12.945657193290367
resetting env. episode reward total was 9.0. running mean

resetting env. episode reward total was 15.0. running mean: 12.961580453633902
resetting env. episode reward total was 11.0. running mean: 12.941964649097562
resetting env. episode reward total was 12.0. running mean: 12.932545002606586
resetting env. episode reward total was 11.0. running mean: 12.91321955258052
resetting env. episode reward total was 13.0. running mean: 12.914087357054715
resetting env. episode reward total was 13.0. running mean: 12.914946483484169
resetting env. episode reward total was 18.0. running mean: 12.965797018649326
resetting env. episode reward total was 16.0. running mean: 12.996139048462833
resetting env. episode reward total was 17.0. running mean: 13.036177657978204
resetting env. episode reward total was 13.0. running mean: 13.035815881398422
resetting env. episode reward total was 15.0. running mean: 13.055457722584439
resetting env. episode reward total was 12.0. running mean: 13.044903145358594
resetting env. episode reward total was 16.0. running

resetting env. episode reward total was 18.0. running mean: 12.708897430227202
resetting env. episode reward total was 12.0. running mean: 12.70180845592493
resetting env. episode reward total was 17.0. running mean: 12.74479037136568
resetting env. episode reward total was 17.0. running mean: 12.787342467652023
resetting env. episode reward total was 12.0. running mean: 12.779469042975501
resetting env. episode reward total was 13.0. running mean: 12.781674352545746
resetting env. episode reward total was 14.0. running mean: 12.79385760902029
resetting env. episode reward total was 15.0. running mean: 12.815919032930086
resetting env. episode reward total was 9.0. running mean: 12.777759842600785
resetting env. episode reward total was 13.0. running mean: 12.779982244174779
resetting env. episode reward total was 14.0. running mean: 12.792182421733031
resetting env. episode reward total was 16.0. running mean: 12.824260597515702
resetting env. episode reward total was 13.0. running me

resetting env. episode reward total was 15.0. running mean: 13.345556920262261
resetting env. episode reward total was 10.0. running mean: 13.312101351059638
resetting env. episode reward total was 13.0. running mean: 13.308980337549043
resetting env. episode reward total was 11.0. running mean: 13.285890534173552
resetting env. episode reward total was 13.0. running mean: 13.283031628831816
resetting env. episode reward total was 18.0. running mean: 13.330201312543497
resetting env. episode reward total was 13.0. running mean: 13.326899299418063
resetting env. episode reward total was 12.0. running mean: 13.313630306423882
resetting env. episode reward total was 6.0. running mean: 13.240494003359643
resetting env. episode reward total was 17.0. running mean: 13.278089063326046
resetting env. episode reward total was 19.0. running mean: 13.335308172692786
resetting env. episode reward total was 12.0. running mean: 13.321955090965856
resetting env. episode reward total was 15.0. running

resetting env. episode reward total was 11.0. running mean: 13.161066236121053
resetting env. episode reward total was 13.0. running mean: 13.159455573759843
resetting env. episode reward total was 18.0. running mean: 13.207861018022244
resetting env. episode reward total was 9.0. running mean: 13.165782407842022
resetting env. episode reward total was 12.0. running mean: 13.1541245837636
resetting env. episode reward total was 12.0. running mean: 13.142583337925963
resetting env. episode reward total was 13.0. running mean: 13.141157504546705
resetting env. episode reward total was 16.0. running mean: 13.169745929501238
resetting env. episode reward total was 8.0. running mean: 13.118048470206226
resetting env. episode reward total was 11.0. running mean: 13.096867985504163
resetting env. episode reward total was 13.0. running mean: 13.095899305649123
resetting env. episode reward total was 17.0. running mean: 13.134940312592631
resetting env. episode reward total was 12.0. running me

resetting env. episode reward total was 14.0. running mean: 13.323427384024246
resetting env. episode reward total was 12.0. running mean: 13.310193110184002
resetting env. episode reward total was 16.0. running mean: 13.337091179082162
resetting env. episode reward total was 7.0. running mean: 13.27372026729134
resetting env. episode reward total was 12.0. running mean: 13.260983064618426
resetting env. episode reward total was 13.0. running mean: 13.258373233972241
resetting env. episode reward total was 16.0. running mean: 13.28578950163252
resetting env. episode reward total was 13.0. running mean: 13.282931606616195
resetting env. episode reward total was 17.0. running mean: 13.320102290550032
resetting env. episode reward total was 9.0. running mean: 13.276901267644531
resetting env. episode reward total was 13.0. running mean: 13.274132254968086
resetting env. episode reward total was 10.0. running mean: 13.241390932418405
resetting env. episode reward total was 10.0. running me

resetting env. episode reward total was 3.0. running mean: 12.809914561750082
resetting env. episode reward total was 16.0. running mean: 12.841815416132581
resetting env. episode reward total was 17.0. running mean: 12.883397261971256
resetting env. episode reward total was 16.0. running mean: 12.914563289351543
resetting env. episode reward total was 15.0. running mean: 12.935417656458029
resetting env. episode reward total was 13.0. running mean: 12.936063479893448
resetting env. episode reward total was 12.0. running mean: 12.926702845094512
resetting env. episode reward total was 13.0. running mean: 12.927435816643568
resetting env. episode reward total was 17.0. running mean: 12.968161458477132
resetting env. episode reward total was 15.0. running mean: 12.98847984389236
resetting env. episode reward total was 13.0. running mean: 12.988595045453437
resetting env. episode reward total was 16.0. running mean: 13.018709094998902
resetting env. episode reward total was 16.0. running 

resetting env. episode reward total was 19.0. running mean: 12.945056089316973
resetting env. episode reward total was 14.0. running mean: 12.955605528423803
resetting env. episode reward total was 18.0. running mean: 13.006049473139564
resetting env. episode reward total was 11.0. running mean: 12.985988978408168
resetting env. episode reward total was 7.0. running mean: 12.926129088624085
resetting env. episode reward total was 17.0. running mean: 12.966867797737844
resetting env. episode reward total was 14.0. running mean: 12.977199119760467
resetting env. episode reward total was 10.0. running mean: 12.947427128562861
resetting env. episode reward total was 15.0. running mean: 12.967952857277233
resetting env. episode reward total was 8.0. running mean: 12.91827332870446
resetting env. episode reward total was 5.0. running mean: 12.839090595417415
resetting env. episode reward total was 10.0. running mean: 12.810699689463242
resetting env. episode reward total was 15.0. running me

resetting env. episode reward total was 17.0. running mean: 12.561997580282643
resetting env. episode reward total was 7.0. running mean: 12.506377604479816
resetting env. episode reward total was 14.0. running mean: 12.521313828435018
resetting env. episode reward total was 11.0. running mean: 12.506100690150667
resetting env. episode reward total was 13.0. running mean: 12.51103968324916
resetting env. episode reward total was 11.0. running mean: 12.495929286416668
resetting env. episode reward total was 15.0. running mean: 12.520969993552502
resetting env. episode reward total was 12.0. running mean: 12.515760293616976
resetting env. episode reward total was 10.0. running mean: 12.490602690680806
resetting env. episode reward total was 10.0. running mean: 12.465696663773997
resetting env. episode reward total was 11.0. running mean: 12.451039697136256
resetting env. episode reward total was 10.0. running mean: 12.426529300164892
resetting env. episode reward total was 17.0. running 

KeyboardInterrupt: 