In [1]:
!pip uninstall -y enum34



In [2]:
!pip install gym[atari]

You should consider upgrading via the '/usr/bin/python3.6 -m pip install --upgrade pip' command.[0m


In [3]:
import torch
import torch.nn as nn
import random
import gym
import numpy as np
import random
import cv2
from tqdm import tqdm
import collections
import math

In [4]:
class ReplayBuffer():
    
    def __init__(self, capacity, state_shape):
        self.capacity = capacity
        
        self.states = torch.zeros(capacity, *state_shape)
        self.actions = torch.zeros(capacity, 1)
        self.rewards = torch.zeros(capacity, 1)
        self.next_states = torch.zeros(capacity, *state_shape)
        self.done = torch.zeros(capacity, 1)
        self.position = 0
        self.length = 0
        
        
    def push(self, state, action, reward, next_state, done):    
        self.states[self.position] = state
        self.actions[self.position] = action
        self.rewards[self.position] = reward
        self.next_states[self.position] = next_state
        self.done[self.position] = done
        
        self.position = (self.position + 1) % self.capacity
        self.length = min(self.length + 1, self.capacity)
        
    def sample(self, batch_size):
        indices = np.random.choice(range(self.length), size=batch_size)
        state_sample = self.states[indices]
        action_sample = self.actions[indices]
        reward_sample = self.rewards[indices]
        next_state_sample = self.next_states[indices]
        done_sample = self.done[indices]
        
        return state_sample, action_sample, reward_sample, next_state_sample, done_sample
        
    def __len__(self):
        return self.length

In [5]:
class MaxAndSkipEnv(gym.Wrapper):
    def __init__(self, env=None, skip=4):
        """Return only every `skip`-th frame"""
        super(MaxAndSkipEnv, self).__init__(env)
        # most recent raw observations (for max pooling across time steps)
        self._obs_buffer = collections.deque(maxlen=2)
        self._skip = skip

    def step(self, action):
        total_reward = 0.0
        done = None
        for _ in range(self._skip):
            obs, reward, done, info = self.env.step(action)
            self._obs_buffer.append(obs)
            total_reward += reward
            if done:
                break
        max_frame = np.max(np.stack(self._obs_buffer), axis=0)
        return max_frame, total_reward, done, info

    def reset(self):
        """Clear past frame buffer and init to first obs"""
        self._obs_buffer.clear()
        obs = self.env.reset()
        self._obs_buffer.append(obs)
        return obs


class ProcessFrame84(gym.ObservationWrapper):
    """
    Downsamples image to 84x84
    Greyscales image

    Returns numpy array
    """
    def __init__(self, env=None):
        super(ProcessFrame84, self).__init__(env)
        self.observation_space = gym.spaces.Box(low=0, high=255, shape=(84, 84, 1), dtype=np.uint8)

    def observation(self, obs):
        return ProcessFrame84.process(obs)

    @staticmethod
    def process(frame):
        if frame.size == 210 * 160 * 3:
            img = np.reshape(frame, [210, 160, 3]).astype(np.float32)
        else:
            assert False, "Unknown resolution."
        img = img[:, :, 0] * 0.299 + img[:, :, 1] * 0.587 + img[:, :, 2] * 0.114
        resized_screen = cv2.resize(img, (84, 110), interpolation=cv2.INTER_AREA)
        x_t = resized_screen[18:102, :]
        x_t = np.reshape(x_t, [84, 84, 1])
        return x_t.astype(np.uint8)


class ImageToPyTorch(gym.ObservationWrapper):
    def __init__(self, env):
        super(ImageToPyTorch, self).__init__(env)
        old_shape = self.observation_space.shape
        self.observation_space = gym.spaces.Box(low=0.0, high=1.0, shape=(old_shape[-1], old_shape[0], old_shape[1]),
                                                dtype=np.float32)

    def observation(self, observation):
        return np.moveaxis(observation, 2, 0)


class ScaledFloatFrame(gym.ObservationWrapper):
    """Normalize pixel values in frame --> 0 to 1"""
    def observation(self, obs):
        return np.array(obs).astype(np.float32) / 255.0


class BufferWrapper(gym.ObservationWrapper):
    def __init__(self, env, n_steps, dtype=np.float32):
        super(BufferWrapper, self).__init__(env)
        self.dtype = dtype
        old_space = env.observation_space
        self.observation_space = gym.spaces.Box(old_space.low.repeat(n_steps, axis=0),
                                                old_space.high.repeat(n_steps, axis=0), dtype=dtype)

    def reset(self):
        self.buffer = np.zeros_like(self.observation_space.low, dtype=self.dtype)
        return self.observation(self.env.reset())

    def observation(self, observation):
        self.buffer[:-1] = self.buffer[1:]
        self.buffer[-1] = observation
        return self.buffer


def make_env(env):
    env = MaxAndSkipEnv(env)
    env = ProcessFrame84(env)
    env = ImageToPyTorch(env)
    env = BufferWrapper(env, 4)
    env = ScaledFloatFrame(env)
    return env

In [11]:
class DQN(nn.Module):

    def __init__(self, input_shape, n_actions):
        super(DQN, self).__init__()
        self.conv = nn.Sequential(
            nn.Conv2d(input_shape[0], 32, kernel_size=8, stride=4),
            nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=4, stride=2),
            nn.ReLU(),
            nn.Conv2d(64, 64, kernel_size=3, stride=1),
            nn.ReLU()
        )

        conv_out_size = self._get_conv_out(input_shape)
        self.fc = nn.Sequential(
            nn.Linear(conv_out_size, 512),
            nn.ReLU(),
            nn.Linear(512, n_actions)
        )
    
    def _get_conv_out(self, shape):
        o = self.conv(torch.zeros(1, *shape))
        return int(np.prod(o.size()))

    def forward(self, x):
        conv_out = self.conv(x).view(x.size()[0], -1)
        return self.fc(conv_out)
    

class DoubleDQNAgent:
    
    def __init__(self, env, memory, policy_net, target_net, batch_size=32, lr=0.00025, target_update=5000,
                 gamma=0.9, exploration_max=0.9, exploration_min=0.02, exploration_steps=10000):
        # Env
        self.env = env
        self.state_shape = env.observation_space.shape
        self.action_space = env.action_space.n
        
        # Device used for training
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        
        # Double DQN
        self.policy_net = policy_net.to(self.device)
        self.target_net = target_net.to(self.device)
        
        # DO NOT COMPUTE GRADIENTS FOR TARGET NET
        self.target_net.eval()
        
        # TODO optimizer choice
        self.lr = lr
        self.optimizer = torch.optim.Adam(self.policy_net.parameters(), lr=self.lr)
        
        # Memory can be chose
        self.memory = memory
    
        # Learning parameters
        self.gamma = gamma
        self.loss_fn = nn.SmoothL1Loss().to(self.device) # Also known as Huber loss <-- TODO loss choice
        self.exploration_max = exploration_max
        self.exploration_rate = exploration_max
        self.exploration_min = exploration_min
        self.exploration_steps = exploration_steps
        self.target_update = target_update
        self.batch_size = batch_size
        
        # Counters and stats
        self.steps = 0
        self.episodes = 0
        self.reward_history = []
        self.loss_history = []
        
    def act(self, state):
        # Epsilon-greedy action
        self.steps += 1
        if random.random() < self.exploration_rate:  
            return torch.tensor([[random.randrange(self.action_space)]])
        # Local net is used for the policy
        return torch.argmax(self.policy_net(state.to(self.device))).unsqueeze(0).unsqueeze(0).cpu()
    
    def _update_target(self):
        self.target_net.load_state_dict(self.policy_net.state_dict())
        
    def _update_exploration_rate(self):
        self.exploration_rate = self.exploration_min + (self.exploration_max - self.exploration_min) * math.exp(
            -1.0 * self.steps / self.exploration_steps
        )
    
    def optimize_model(self):
        
        # Update target DQN every `target_update` steps
        if self.steps % self.target_update == 0:
            self._update_target()
        
        # Replay buffer not full
        if self.batch_size > len(self.memory):
            return
        
        # Sample from buffer
        state_batch, action_batch, reward_batch, next_state_batch, done_batch = self.memory.sample(self.batch_size)
        
        # Send to GPU
        state_batch = state_batch.to(self.device)
        action_batch = action_batch.to(self.device)
        reward_batch = reward_batch.to(self.device)
        next_state_batch = next_state_batch.to(self.device)
        done_batch = done_batch.to(self.device)
        
        # Zero optimizer gradients
        self.optimizer.zero_grad()
        
        # Compute target state-action values
        expected_state_action_values = reward_batch + torch.mul((self.gamma * self.target_net(next_state_batch).max(1).values.unsqueeze(1)), 1 - done_batch)
        
        # Compute state-action values from policy_net
        state_action_values = self.policy_net(state_batch).gather(1, action_batch.long())
        
        # Compute loss
        loss = self.loss_fn(state_action_values, expected_state_action_values)
        
        # Optimize model
        loss.backward()
        self.optimizer.step()
        
        # Decay exploration rate
        self._update_exploration_rate()
        
    def train(self, num_episodes):
        for episode in tqdm(range(num_episodes)):
            
            state = env.reset()
            state = torch.tensor(state)
            
            episode_reward = 0
            
            done = False
            while not done:
                
                self.steps += 1
                action = self.act(state.view((1,)+self.state_shape))
                
                next_state, reward, done, info = self.env.step(int(action[0]))
                
                next_state_tensor = torch.tensor(next_state)
                reward_tensor = torch.tensor(reward)
                done_tensor = torch.tensor(int(done)).unsqueeze(0)
                
                self.memory.push(state, action, reward_tensor, next_state_tensor, done_tensor)
                self.optimize_model()
                
                episode_reward += reward
                
                state = next_state_tensor
            
            self.reward_history.append(episode_reward)
            self.episodes += 1
            print("Episode reward for episode {} is {}".format(self.episodes + 1, episode_reward))
            print("Average reward is {}".format(np.mean(self.reward_history)))
            print("Epsilon is {}".format(np.mean(self.exploration_rate)))

            

In [7]:
env = gym.make('Breakout-v4')
env = make_env(env)

In [12]:
policy_net = DQN(env.observation_space.shape, env.action_space.n)
target_net = DQN(env.observation_space.shape, env.action_space.n)
memory = ReplayBuffer(30000, env.observation_space.shape)
agent = DoubleDQNAgent(env, memory, policy_net, target_net)

agent.train(1000)

  0%|          | 1/1000 [00:00<09:12,  1.81it/s]

Episode reward for episode 2 is 0.0
Average reward is 0.0
Epsilon is 0.8886340388178531


  0%|          | 2/1000 [00:01<11:35,  1.43it/s]

Episode reward for episode 3 is 1.0
Average reward is 0.5
Epsilon is 0.875017475126428


  0%|          | 3/1000 [00:02<12:26,  1.34it/s]

Episode reward for episode 4 is 2.0
Average reward is 1.0
Epsilon is 0.8631306325240032


  0%|          | 4/1000 [00:03<13:05,  1.27it/s]

Episode reward for episode 5 is 0.0
Average reward is 0.75
Epsilon is 0.8539070184482436


  0%|          | 5/1000 [00:04<13:20,  1.24it/s]

Episode reward for episode 6 is 1.0
Average reward is 0.8
Epsilon is 0.8434657084072205


  1%|          | 6/1000 [00:04<13:06,  1.26it/s]

Episode reward for episode 7 is 1.0
Average reward is 0.8333333333333334
Epsilon is 0.8336431729889855


  1%|          | 7/1000 [00:05<13:04,  1.27it/s]

Episode reward for episode 8 is 0.0
Average reward is 0.7142857142857143
Epsilon is 0.8232949105422761


  1%|          | 8/1000 [00:06<13:55,  1.19it/s]

Episode reward for episode 9 is 2.0
Average reward is 0.875
Epsilon is 0.8108607485650213


  1%|          | 9/1000 [00:07<14:50,  1.11it/s]

Episode reward for episode 10 is 3.0
Average reward is 1.1111111111111112
Epsilon is 0.7981520230535224


  1%|          | 10/1000 [00:08<16:31,  1.00s/it]

Episode reward for episode 11 is 1.0
Average reward is 1.1
Epsilon is 0.7853413224607905


  1%|          | 11/1000 [00:09<15:55,  1.04it/s]

Episode reward for episode 12 is 1.0
Average reward is 1.0909090909090908
Epsilon is 0.7743993784900827


  1%|          | 12/1000 [00:10<14:44,  1.12it/s]

Episode reward for episode 13 is 0.0
Average reward is 1.0
Epsilon is 0.765698906001113


  1%|▏         | 13/1000 [00:11<17:19,  1.05s/it]

Episode reward for episode 14 is 4.0
Average reward is 1.2307692307692308
Epsilon is 0.7500564847159423


  1%|▏         | 14/1000 [00:13<18:15,  1.11s/it]

Episode reward for episode 15 is 3.0
Average reward is 1.3571428571428572
Epsilon is 0.7360298868300518


  2%|▏         | 15/1000 [00:13<16:25,  1.00s/it]

Episode reward for episode 16 is 0.0
Average reward is 1.2666666666666666
Epsilon is 0.7279134974400191


  2%|▏         | 16/1000 [00:14<15:56,  1.03it/s]

Episode reward for episode 17 is 0.0
Average reward is 1.1875
Epsilon is 0.719609209824353


  2%|▏         | 17/1000 [00:16<17:21,  1.06s/it]

Episode reward for episode 18 is 3.0
Average reward is 1.2941176470588236
Epsilon is 0.7057560193030181


  2%|▏         | 18/1000 [00:17<16:48,  1.03s/it]

Episode reward for episode 19 is 1.0
Average reward is 1.2777777777777777
Epsilon is 0.6960870956564073


  2%|▏         | 19/1000 [00:18<17:57,  1.10s/it]

Episode reward for episode 20 is 3.0
Average reward is 1.368421052631579
Epsilon is 0.6836281034949724


  2%|▏         | 20/1000 [00:19<16:43,  1.02s/it]

Episode reward for episode 21 is 0.0
Average reward is 1.3
Epsilon is 0.6751877969700003


  2%|▏         | 21/1000 [00:20<16:55,  1.04s/it]

Episode reward for episode 22 is 0.0
Average reward is 1.2380952380952381
Epsilon is 0.6659498747283074


  2%|▏         | 22/1000 [00:21<16:41,  1.02s/it]

Episode reward for episode 23 is 1.0
Average reward is 1.2272727272727273
Epsilon is 0.6572244239849236


  2%|▏         | 23/1000 [00:22<15:53,  1.02it/s]

Episode reward for episode 24 is 0.0
Average reward is 1.173913043478261
Epsilon is 0.649245767336944


  2%|▏         | 24/1000 [00:23<16:59,  1.05s/it]

Episode reward for episode 25 is 1.0
Average reward is 1.1666666666666667
Epsilon is 0.638020672454744


  2%|▎         | 25/1000 [00:24<15:47,  1.03it/s]

Episode reward for episode 26 is 0.0
Average reward is 1.12
Epsilon is 0.6311374588716916


  3%|▎         | 26/1000 [00:25<16:18,  1.00s/it]

Episode reward for episode 27 is 1.0
Average reward is 1.1153846153846154
Epsilon is 0.6217980401068731


  3%|▎         | 27/1000 [00:26<15:30,  1.05it/s]

Episode reward for episode 28 is 0.0
Average reward is 1.0740740740740742
Epsilon is 0.6156908994638799


  3%|▎         | 28/1000 [00:27<15:59,  1.01it/s]

Episode reward for episode 29 is 1.0
Average reward is 1.0714285714285714
Epsilon is 0.6064702296512076


  3%|▎         | 29/1000 [00:28<16:50,  1.04s/it]

Episode reward for episode 30 is 1.0
Average reward is 1.0689655172413792
Epsilon is 0.5962386555709216


  3%|▎         | 30/1000 [00:29<16:05,  1.01it/s]

Episode reward for episode 31 is 1.0
Average reward is 1.0666666666666667
Epsilon is 0.5889098054831355


  3%|▎         | 31/1000 [00:29<14:31,  1.11it/s]

Episode reward for episode 32 is 0.0
Average reward is 1.032258064516129
Epsilon is 0.5833617194142495


  3%|▎         | 32/1000 [00:30<14:25,  1.12it/s]

Episode reward for episode 33 is 0.0
Average reward is 1.0
Epsilon is 0.5761966437171085


  3%|▎         | 33/1000 [00:32<16:20,  1.01s/it]

Episode reward for episode 34 is 2.0
Average reward is 1.0303030303030303
Epsilon is 0.5669305928662988


  3%|▎         | 34/1000 [00:32<15:27,  1.04it/s]

Episode reward for episode 35 is 0.0
Average reward is 1.0
Epsilon is 0.5601905282776791


  4%|▎         | 35/1000 [00:33<16:05,  1.00s/it]

Episode reward for episode 36 is 1.0
Average reward is 1.0
Epsilon is 0.5519353223773762


  4%|▎         | 36/1000 [00:34<15:07,  1.06it/s]

Episode reward for episode 37 is 0.0
Average reward is 0.9722222222222222
Epsilon is 0.5463265867752299


  4%|▎         | 37/1000 [00:36<17:26,  1.09s/it]

Episode reward for episode 38 is 3.0
Average reward is 1.027027027027027
Epsilon is 0.5356983015011874


  4%|▍         | 38/1000 [00:37<18:30,  1.15s/it]

Episode reward for episode 39 is 2.0
Average reward is 1.0526315789473684
Epsilon is 0.5263974865618911


  4%|▍         | 39/1000 [00:38<17:21,  1.08s/it]

Episode reward for episode 40 is 1.0
Average reward is 1.0512820512820513
Epsilon is 0.5195571008006467


  4%|▍         | 40/1000 [00:39<17:42,  1.11s/it]

Episode reward for episode 41 is 3.0
Average reward is 1.1
Epsilon is 0.5109399934888486


  4%|▍         | 41/1000 [00:40<16:33,  1.04s/it]

Episode reward for episode 42 is 0.0
Average reward is 1.0731707317073171
Epsilon is 0.5049869131959275


  4%|▍         | 42/1000 [00:41<16:10,  1.01s/it]

Episode reward for episode 43 is 0.0
Average reward is 1.0476190476190477
Epsilon is 0.4980531445949406


  4%|▍         | 43/1000 [00:42<15:56,  1.00it/s]

Episode reward for episode 44 is 2.0
Average reward is 1.069767441860465
Epsilon is 0.4914070319122097


  4%|▍         | 44/1000 [00:43<16:19,  1.02s/it]

Episode reward for episode 45 is 2.0
Average reward is 1.0909090909090908
Epsilon is 0.4848533165168745


  4%|▍         | 45/1000 [00:44<17:46,  1.12s/it]

Episode reward for episode 46 is 3.0
Average reward is 1.1333333333333333
Epsilon is 0.47592207532771946


  5%|▍         | 46/1000 [00:45<16:55,  1.06s/it]

Episode reward for episode 47 is 1.0
Average reward is 1.1304347826086956
Epsilon is 0.4697635082829211


  5%|▍         | 47/1000 [00:46<16:01,  1.01s/it]

Episode reward for episode 48 is 1.0
Average reward is 1.127659574468085
Epsilon is 0.46404322330152503


  5%|▍         | 48/1000 [00:48<18:00,  1.14s/it]

Episode reward for episode 49 is 3.0
Average reward is 1.1666666666666667
Epsilon is 0.454989506353425


  5%|▍         | 49/1000 [00:49<19:13,  1.21s/it]

Episode reward for episode 50 is 1.0
Average reward is 1.163265306122449
Epsilon is 0.4466320394752621


  5%|▌         | 50/1000 [00:50<19:18,  1.22s/it]

Episode reward for episode 51 is 1.0
Average reward is 1.16
Epsilon is 0.4398602456585103


  5%|▌         | 51/1000 [00:52<20:48,  1.32s/it]

Episode reward for episode 52 is 3.0
Average reward is 1.196078431372549
Epsilon is 0.43162877329135174


  5%|▌         | 52/1000 [00:53<21:36,  1.37s/it]

Episode reward for episode 53 is 2.0
Average reward is 1.2115384615384615
Epsilon is 0.42323596330009805


  5%|▌         | 53/1000 [00:55<24:19,  1.54s/it]

Episode reward for episode 54 is 5.0
Average reward is 1.2830188679245282
Epsilon is 0.41328003189287854


  5%|▌         | 54/1000 [00:56<22:35,  1.43s/it]

Episode reward for episode 55 is 2.0
Average reward is 1.2962962962962963
Epsilon is 0.4072699160729942


  6%|▌         | 55/1000 [00:57<21:06,  1.34s/it]

Episode reward for episode 56 is 1.0
Average reward is 1.290909090909091
Epsilon is 0.401351647001582


  6%|▌         | 56/1000 [00:59<20:15,  1.29s/it]

Episode reward for episode 57 is 2.0
Average reward is 1.3035714285714286
Epsilon is 0.39477352397925514


  6%|▌         | 57/1000 [01:00<20:01,  1.27s/it]

Episode reward for episode 58 is 2.0
Average reward is 1.3157894736842106
Epsilon is 0.3884562232685746


  6%|▌         | 58/1000 [01:01<17:41,  1.13s/it]

Episode reward for episode 59 is 0.0
Average reward is 1.293103448275862
Epsilon is 0.38464413576117334


  6%|▌         | 59/1000 [01:02<18:30,  1.18s/it]

Episode reward for episode 60 is 2.0
Average reward is 1.305084745762712
Epsilon is 0.37821089583802164


  6%|▌         | 60/1000 [01:03<16:34,  1.06s/it]

Episode reward for episode 61 is 0.0
Average reward is 1.2833333333333334
Epsilon is 0.3746466378715739


  6%|▌         | 61/1000 [01:05<22:26,  1.43s/it]

Episode reward for episode 62 is 4.0
Average reward is 1.3278688524590163
Epsilon is 0.36382125249002994


  6%|▌         | 62/1000 [01:06<19:14,  1.23s/it]

Episode reward for episode 63 is 0.0
Average reward is 1.3064516129032258
Epsilon is 0.36080889926201537


  6%|▋         | 63/1000 [01:07<17:37,  1.13s/it]

Episode reward for episode 64 is 1.0
Average reward is 1.3015873015873016
Epsilon is 0.3565416472744301


  6%|▋         | 64/1000 [01:07<15:47,  1.01s/it]

Episode reward for episode 65 is 1.0
Average reward is 1.296875
Epsilon is 0.3531263699967498


  6%|▋         | 65/1000 [01:09<16:20,  1.05s/it]

Episode reward for episode 66 is 3.0
Average reward is 1.323076923076923
Epsilon is 0.3477076524635532


  7%|▋         | 66/1000 [01:10<18:15,  1.17s/it]

Episode reward for episode 67 is 1.0
Average reward is 1.3181818181818181
Epsilon is 0.3410259328213267


  7%|▋         | 67/1000 [01:11<15:41,  1.01s/it]

Episode reward for episode 68 is 0.0
Average reward is 1.2985074626865671
Epsilon is 0.3381496620591659


  7%|▋         | 68/1000 [01:12<15:48,  1.02s/it]

Episode reward for episode 69 is 0.0
Average reward is 1.2794117647058822
Epsilon is 0.33391489290696563


  7%|▋         | 69/1000 [01:13<15:06,  1.03it/s]

Episode reward for episode 70 is 0.0
Average reward is 1.2608695652173914
Epsilon is 0.3302945189147689


  7%|▋         | 70/1000 [01:14<15:08,  1.02it/s]

Episode reward for episode 71 is 0.0
Average reward is 1.2428571428571429
Epsilon is 0.3265319242798812


  7%|▋         | 71/1000 [01:15<15:52,  1.02s/it]

Episode reward for episode 72 is 3.0
Average reward is 1.267605633802817
Epsilon is 0.3216664411529491


  7%|▋         | 72/1000 [01:16<16:04,  1.04s/it]

Episode reward for episode 73 is 1.0
Average reward is 1.2638888888888888
Epsilon is 0.31717521295733486


  7%|▋         | 73/1000 [01:17<17:33,  1.14s/it]

Episode reward for episode 74 is 2.0
Average reward is 1.273972602739726
Epsilon is 0.31239976011415266


  7%|▋         | 74/1000 [01:18<17:20,  1.12s/it]

Episode reward for episode 75 is 1.0
Average reward is 1.2702702702702702
Epsilon is 0.307816149779966


  8%|▊         | 75/1000 [01:19<17:00,  1.10s/it]

Episode reward for episode 76 is 3.0
Average reward is 1.2933333333333332
Epsilon is 0.30364456069456613


  8%|▊         | 76/1000 [01:20<16:36,  1.08s/it]

Episode reward for episode 77 is 2.0
Average reward is 1.3026315789473684
Epsilon is 0.29970120474442996


  8%|▊         | 77/1000 [01:21<16:56,  1.10s/it]

Episode reward for episode 78 is 0.0
Average reward is 1.2857142857142858
Epsilon is 0.2955369963150988


  8%|▊         | 78/1000 [01:22<16:04,  1.05s/it]

Episode reward for episode 79 is 0.0
Average reward is 1.2692307692307692
Epsilon is 0.2923047674136415


  8%|▊         | 79/1000 [01:23<15:43,  1.02s/it]

Episode reward for episode 80 is 0.0
Average reward is 1.2531645569620253
Epsilon is 0.2890566379588664


  8%|▊         | 80/1000 [01:24<13:51,  1.11it/s]

Episode reward for episode 81 is 0.0
Average reward is 1.2375
Epsilon is 0.2866993269258678


  8%|▊         | 81/1000 [01:25<15:02,  1.02it/s]

Episode reward for episode 82 is 2.0
Average reward is 1.2469135802469136
Epsilon is 0.28241360589830095


  8%|▊         | 82/1000 [01:26<15:16,  1.00it/s]

Episode reward for episode 83 is 0.0
Average reward is 1.2317073170731707
Epsilon is 0.278765412357477


  8%|▊         | 83/1000 [01:28<18:27,  1.21s/it]

Episode reward for episode 84 is 5.0
Average reward is 1.2771084337349397
Epsilon is 0.2726795051119867


  8%|▊         | 84/1000 [01:30<20:48,  1.36s/it]

Episode reward for episode 85 is 3.0
Average reward is 1.2976190476190477
Epsilon is 0.2668848188625445


  8%|▊         | 85/1000 [01:31<22:13,  1.46s/it]

Episode reward for episode 86 is 5.0
Average reward is 1.3411764705882352
Epsilon is 0.2611747820781392


  9%|▊         | 86/1000 [01:32<19:26,  1.28s/it]

Episode reward for episode 87 is 1.0
Average reward is 1.3372093023255813
Epsilon is 0.2583456443893326


  9%|▊         | 87/1000 [01:34<20:17,  1.33s/it]

Episode reward for episode 88 is 4.0
Average reward is 1.367816091954023
Epsilon is 0.2541874604031935


  9%|▉         | 88/1000 [01:35<19:50,  1.31s/it]

Episode reward for episode 89 is 2.0
Average reward is 1.375
Epsilon is 0.2506547277895454


  9%|▉         | 89/1000 [01:36<17:47,  1.17s/it]

Episode reward for episode 90 is 0.0
Average reward is 1.3595505617977528
Epsilon is 0.2479945915662093


  9%|▉         | 90/1000 [01:37<20:31,  1.35s/it]

Episode reward for episode 91 is 4.0
Average reward is 1.3888888888888888
Epsilon is 0.24258786164818125


  9%|▉         | 91/1000 [01:39<20:07,  1.33s/it]

Episode reward for episode 92 is 2.0
Average reward is 1.3956043956043955
Epsilon is 0.23901098918231292


  9%|▉         | 92/1000 [01:40<21:21,  1.41s/it]

Episode reward for episode 93 is 4.0
Average reward is 1.423913043478261
Epsilon is 0.23458842849534167


  9%|▉         | 93/1000 [01:42<21:47,  1.44s/it]

Episode reward for episode 94 is 2.0
Average reward is 1.4301075268817205
Epsilon is 0.23042344546466995


  9%|▉         | 94/1000 [01:43<18:26,  1.22s/it]

Episode reward for episode 95 is 0.0
Average reward is 1.4148936170212767
Epsilon is 0.22853813109600177


 10%|▉         | 95/1000 [01:44<18:12,  1.21s/it]

Episode reward for episode 96 is 2.0
Average reward is 1.4210526315789473
Epsilon is 0.22551559260316587


 10%|▉         | 96/1000 [01:46<21:25,  1.42s/it]

Episode reward for episode 97 is 4.0
Average reward is 1.4479166666666667
Epsilon is 0.2203211657777552


 10%|▉         | 97/1000 [01:47<21:06,  1.40s/it]

Episode reward for episode 98 is 1.0
Average reward is 1.443298969072165
Epsilon is 0.21666895966401797


 10%|▉         | 98/1000 [01:49<22:01,  1.46s/it]

Episode reward for episode 99 is 5.0
Average reward is 1.4795918367346939
Epsilon is 0.21254346250152617


 10%|▉         | 99/1000 [01:50<19:30,  1.30s/it]

Episode reward for episode 100 is 1.0
Average reward is 1.4747474747474747
Epsilon is 0.21017066531244297


 10%|█         | 100/1000 [01:51<20:09,  1.34s/it]

Episode reward for episode 101 is 4.0
Average reward is 1.5
Epsilon is 0.20662885414705406


 10%|█         | 101/1000 [01:52<19:10,  1.28s/it]

Episode reward for episode 102 is 1.0
Average reward is 1.495049504950495
Epsilon is 0.20392386732968568


 10%|█         | 102/1000 [01:53<18:23,  1.23s/it]

Episode reward for episode 103 is 1.0
Average reward is 1.4901960784313726
Epsilon is 0.2013306041574461


 10%|█         | 103/1000 [01:55<19:29,  1.30s/it]

Episode reward for episode 104 is 3.0
Average reward is 1.5048543689320388
Epsilon is 0.1978822665311169


 10%|█         | 104/1000 [01:56<19:44,  1.32s/it]

Episode reward for episode 105 is 3.0
Average reward is 1.5192307692307692
Epsilon is 0.19460423655745246


 10%|█         | 105/1000 [01:57<18:48,  1.26s/it]

Episode reward for episode 106 is 2.0
Average reward is 1.5238095238095237
Epsilon is 0.19221124769273887


 11%|█         | 106/1000 [01:58<17:37,  1.18s/it]

Episode reward for episode 107 is 1.0
Average reward is 1.5188679245283019
Epsilon is 0.18991900927051003


 11%|█         | 107/1000 [02:00<19:21,  1.30s/it]

Episode reward for episode 108 is 4.0
Average reward is 1.5420560747663552
Epsilon is 0.18645448479787297


 11%|█         | 108/1000 [02:00<16:49,  1.13s/it]

Episode reward for episode 109 is 0.0
Average reward is 1.5277777777777777
Epsilon is 0.18483119794412217


 11%|█         | 109/1000 [02:02<18:37,  1.25s/it]

Episode reward for episode 110 is 3.0
Average reward is 1.5412844036697249
Epsilon is 0.18147041022776741


 11%|█         | 110/1000 [02:03<17:52,  1.20s/it]

Episode reward for episode 111 is 1.0
Average reward is 1.5363636363636364
Epsilon is 0.1794486469156219


 11%|█         | 111/1000 [02:04<15:26,  1.04s/it]

Episode reward for episode 112 is 0.0
Average reward is 1.5225225225225225
Epsilon is 0.17814651405193124


 11%|█         | 112/1000 [02:05<16:32,  1.12s/it]

Episode reward for episode 113 is 2.0
Average reward is 1.5267857142857142
Epsilon is 0.17551184579631188


 11%|█▏        | 113/1000 [02:06<14:18,  1.03it/s]

Episode reward for episode 114 is 0.0
Average reward is 1.5132743362831858
Epsilon is 0.17427271416516096


 11%|█▏        | 114/1000 [02:07<14:41,  1.00it/s]

Episode reward for episode 115 is 2.0
Average reward is 1.5175438596491229
Epsilon is 0.17221924874029015


 12%|█▏        | 115/1000 [02:08<16:42,  1.13s/it]

Episode reward for episode 116 is 3.0
Average reward is 1.5304347826086957
Epsilon is 0.16956362797526336


 12%|█▏        | 116/1000 [02:09<15:54,  1.08s/it]

Episode reward for episode 117 is 1.0
Average reward is 1.5258620689655173
Epsilon is 0.16777959007536303


 12%|█▏        | 117/1000 [02:11<17:35,  1.20s/it]

Episode reward for episode 118 is 4.0
Average reward is 1.547008547008547
Epsilon is 0.16496928718563186


 12%|█▏        | 118/1000 [02:12<18:26,  1.25s/it]

Episode reward for episode 119 is 4.0
Average reward is 1.5677966101694916
Epsilon is 0.1623262428777183


 12%|█▏        | 119/1000 [02:13<16:53,  1.15s/it]

Episode reward for episode 120 is 1.0
Average reward is 1.5630252100840336
Epsilon is 0.16062853458539028


 12%|█▏        | 120/1000 [02:14<15:08,  1.03s/it]

Episode reward for episode 121 is 0.0
Average reward is 1.55
Epsilon is 0.15945220240684796


 12%|█▏        | 121/1000 [02:15<16:47,  1.15s/it]

Episode reward for episode 122 is 3.0
Average reward is 1.56198347107438
Epsilon is 0.15696451908054848


 12%|█▏        | 122/1000 [02:16<16:54,  1.16s/it]

Episode reward for episode 123 is 2.0
Average reward is 1.5655737704918034
Epsilon is 0.15481748590641248


 12%|█▏        | 123/1000 [02:17<16:46,  1.15s/it]

Episode reward for episode 124 is 2.0
Average reward is 1.5691056910569106
Epsilon is 0.15283687975306467


 12%|█▏        | 124/1000 [02:18<16:06,  1.10s/it]

Episode reward for episode 125 is 2.0
Average reward is 1.5725806451612903
Epsilon is 0.15114740340775615


 12%|█▎        | 125/1000 [02:20<18:30,  1.27s/it]

Episode reward for episode 126 is 3.0
Average reward is 1.584
Epsilon is 0.14870486403430333


 13%|█▎        | 126/1000 [02:21<16:22,  1.12s/it]

Episode reward for episode 127 is 0.0
Average reward is 1.5714285714285714
Epsilon is 0.14755171720246849


 13%|█▎        | 127/1000 [02:22<15:59,  1.10s/it]

Episode reward for episode 128 is 1.0
Average reward is 1.5669291338582678
Epsilon is 0.14590427644515097


 13%|█▎        | 128/1000 [02:23<14:39,  1.01s/it]

Episode reward for episode 129 is 1.0
Average reward is 1.5625
Epsilon is 0.144601657330437


 13%|█▎        | 129/1000 [02:24<13:55,  1.04it/s]

Episode reward for episode 130 is 1.0
Average reward is 1.558139534883721
Epsilon is 0.14323854993511498


 13%|█▎        | 130/1000 [02:25<16:28,  1.14s/it]

Episode reward for episode 131 is 4.0
Average reward is 1.5769230769230769
Epsilon is 0.1408224252154037


 13%|█▎        | 131/1000 [02:26<16:24,  1.13s/it]

Episode reward for episode 132 is 2.0
Average reward is 1.5801526717557253
Epsilon is 0.1391903634081983


 13%|█▎        | 132/1000 [02:27<15:00,  1.04s/it]

Episode reward for episode 133 is 0.0
Average reward is 1.5681818181818181
Epsilon is 0.13798080095682497


 13%|█▎        | 133/1000 [02:28<13:24,  1.08it/s]

Episode reward for episode 134 is 0.0
Average reward is 1.556390977443609
Epsilon is 0.13701731408389012


 13%|█▎        | 134/1000 [02:29<13:18,  1.08it/s]

Episode reward for episode 135 is 1.0
Average reward is 1.5522388059701493
Epsilon is 0.1356677558115096


 14%|█▎        | 135/1000 [02:30<13:35,  1.06it/s]

Episode reward for episode 136 is 2.0
Average reward is 1.5555555555555556
Epsilon is 0.1342194853527423


 14%|█▎        | 136/1000 [02:31<14:55,  1.04s/it]

Episode reward for episode 137 is 3.0
Average reward is 1.5661764705882353
Epsilon is 0.13247398016511924


 14%|█▎        | 137/1000 [02:32<15:32,  1.08s/it]

Episode reward for episode 138 is 2.0
Average reward is 1.5693430656934306
Epsilon is 0.13084378940477798


 14%|█▍        | 138/1000 [02:33<15:03,  1.05s/it]

Episode reward for episode 139 is 2.0
Average reward is 1.5724637681159421
Epsilon is 0.1294778129727041


 14%|█▍        | 139/1000 [02:34<15:05,  1.05s/it]

Episode reward for episode 140 is 2.0
Average reward is 1.5755395683453237
Epsilon is 0.12806381232207445


 14%|█▍        | 140/1000 [02:36<17:51,  1.25s/it]

Episode reward for episode 141 is 5.0
Average reward is 1.6
Epsilon is 0.12571236916647593


 14%|█▍        | 141/1000 [02:37<18:28,  1.29s/it]

Episode reward for episode 142 is 3.0
Average reward is 1.6099290780141844
Epsilon is 0.12409686995278096


 14%|█▍        | 142/1000 [02:38<18:06,  1.27s/it]

Episode reward for episode 143 is 0.0
Average reward is 1.5985915492957747
Epsilon is 0.12252656210270334


 14%|█▍        | 143/1000 [02:39<15:39,  1.10s/it]

Episode reward for episode 144 is 0.0
Average reward is 1.5874125874125875
Epsilon is 0.12162828656542755


 14%|█▍        | 144/1000 [02:40<14:15,  1.00it/s]

Episode reward for episode 145 is 1.0
Average reward is 1.5833333333333333
Epsilon is 0.12061706821831433


 14%|█▍        | 145/1000 [02:41<15:12,  1.07s/it]

Episode reward for episode 146 is 3.0
Average reward is 1.5931034482758621
Epsilon is 0.11903981167918112


 15%|█▍        | 146/1000 [02:42<14:27,  1.02s/it]

Episode reward for episode 147 is 1.0
Average reward is 1.5890410958904109
Epsilon is 0.11787801001194421


 15%|█▍        | 147/1000 [02:44<16:51,  1.19s/it]

Episode reward for episode 148 is 4.0
Average reward is 1.6054421768707483
Epsilon is 0.11607430547838603


 15%|█▍        | 148/1000 [02:45<16:10,  1.14s/it]

Episode reward for episode 149 is 2.0
Average reward is 1.6081081081081081
Epsilon is 0.11481445793280538


 15%|█▍        | 149/1000 [02:45<14:09,  1.00it/s]

Episode reward for episode 150 is 0.0
Average reward is 1.5973154362416107
Epsilon is 0.11400254980359313


 15%|█▌        | 150/1000 [02:46<13:34,  1.04it/s]

Episode reward for episode 151 is 1.0
Average reward is 1.5933333333333333
Epsilon is 0.11295559513607428


 15%|█▌        | 151/1000 [02:47<14:13,  1.01s/it]

Episode reward for episode 152 is 2.0
Average reward is 1.596026490066225
Epsilon is 0.11158998278633486


 15%|█▌        | 152/1000 [02:49<17:28,  1.24s/it]

Episode reward for episode 153 is 5.0
Average reward is 1.618421052631579
Epsilon is 0.10961492748255791


 15%|█▌        | 153/1000 [02:50<16:53,  1.20s/it]

Episode reward for episode 154 is 2.0
Average reward is 1.6209150326797386
Epsilon is 0.10838673549932783


 15%|█▌        | 154/1000 [02:52<20:15,  1.44s/it]

Episode reward for episode 155 is 3.0
Average reward is 1.62987012987013
Epsilon is 0.10627345038448494


 16%|█▌        | 155/1000 [02:53<19:06,  1.36s/it]

Episode reward for episode 156 is 1.0
Average reward is 1.6258064516129032
Epsilon is 0.10505702445307995


 16%|█▌        | 156/1000 [02:55<19:26,  1.38s/it]

Episode reward for episode 157 is 3.0
Average reward is 1.6346153846153846
Epsilon is 0.10367346544593817


 16%|█▌        | 157/1000 [02:56<17:11,  1.22s/it]

Episode reward for episode 158 is 1.0
Average reward is 1.6305732484076434
Epsilon is 0.10279121092201575


 16%|█▌        | 158/1000 [02:57<20:00,  1.43s/it]

Episode reward for episode 159 is 4.0
Average reward is 1.6455696202531647
Epsilon is 0.10089256428942935


 16%|█▌        | 159/1000 [02:58<16:53,  1.21s/it]

Episode reward for episode 160 is 0.0
Average reward is 1.6352201257861636
Epsilon is 0.1001677975533059


 16%|█▌        | 160/1000 [02:59<16:35,  1.19s/it]

Episode reward for episode 161 is 3.0
Average reward is 1.64375
Epsilon is 0.099021653312377


 16%|█▌        | 161/1000 [03:00<15:17,  1.09s/it]

Episode reward for episode 162 is 1.0
Average reward is 1.639751552795031
Epsilon is 0.09820408683260902


 16%|█▌        | 162/1000 [03:02<16:46,  1.20s/it]

Episode reward for episode 163 is 2.0
Average reward is 1.6419753086419753
Epsilon is 0.09679364639289956


 16%|█▋        | 163/1000 [03:03<17:32,  1.26s/it]

Episode reward for episode 164 is 4.0
Average reward is 1.656441717791411
Epsilon is 0.09546899483500694


 16%|█▋        | 164/1000 [03:05<18:33,  1.33s/it]

Episode reward for episode 165 is 4.0
Average reward is 1.670731707317073
Epsilon is 0.09404862022131771


 16%|█▋        | 165/1000 [03:05<16:59,  1.22s/it]

Episode reward for episode 166 is 1.0
Average reward is 1.6666666666666667
Epsilon is 0.09326785023847675


 17%|█▋        | 166/1000 [03:07<16:44,  1.20s/it]

Episode reward for episode 167 is 3.0
Average reward is 1.6746987951807228
Epsilon is 0.0922203532738728


 17%|█▋        | 167/1000 [03:08<16:35,  1.19s/it]

Episode reward for episode 168 is 0.0
Average reward is 1.6646706586826348
Epsilon is 0.09118783218050411


 17%|█▋        | 168/1000 [03:08<14:20,  1.03s/it]

Episode reward for episode 169 is 0.0
Average reward is 1.6547619047619047
Epsilon is 0.09059235887943128


 17%|█▋        | 169/1000 [03:10<15:05,  1.09s/it]

Episode reward for episode 170 is 2.0
Average reward is 1.6568047337278107
Epsilon is 0.0894996632664704


 17%|█▋        | 170/1000 [03:11<15:39,  1.13s/it]

Episode reward for episode 171 is 1.0
Average reward is 1.6529411764705881
Epsilon is 0.08850603937719746


 17%|█▋        | 171/1000 [03:13<21:13,  1.54s/it]

Episode reward for episode 172 is 5.0
Average reward is 1.672514619883041
Epsilon is 0.08642821609912137


 17%|█▋        | 172/1000 [03:15<20:00,  1.45s/it]

Episode reward for episode 173 is 1.0
Average reward is 1.6686046511627908
Epsilon is 0.08541305834869489


 17%|█▋        | 173/1000 [03:15<17:08,  1.24s/it]

Episode reward for episode 174 is 0.0
Average reward is 1.6589595375722543
Epsilon is 0.08478809759983315


 17%|█▋        | 174/1000 [03:17<17:29,  1.27s/it]

Episode reward for episode 175 is 1.0
Average reward is 1.6551724137931034
Epsilon is 0.08378524585860812


 18%|█▊        | 175/1000 [03:19<19:52,  1.45s/it]

Episode reward for episode 176 is 2.0
Average reward is 1.6571428571428573
Epsilon is 0.08235986675445008


 18%|█▊        | 176/1000 [03:21<25:41,  1.87s/it]

Episode reward for episode 177 is 2.0
Average reward is 1.6590909090909092
Epsilon is 0.08016687233879169


 18%|█▊        | 177/1000 [03:24<28:47,  2.10s/it]

Episode reward for episode 178 is 2.0
Average reward is 1.6610169491525424
Epsilon is 0.07830698558142873


 18%|█▊        | 178/1000 [03:25<24:37,  1.80s/it]

Episode reward for episode 179 is 1.0
Average reward is 1.6573033707865168
Epsilon is 0.07751937844565845


 18%|█▊        | 179/1000 [03:26<22:23,  1.64s/it]

Episode reward for episode 180 is 3.0
Average reward is 1.664804469273743
Epsilon is 0.07659507161467173


 18%|█▊        | 180/1000 [03:28<23:33,  1.72s/it]

Episode reward for episode 181 is 1.0
Average reward is 1.6611111111111112
Epsilon is 0.07530824025548061


 18%|█▊        | 181/1000 [03:30<24:12,  1.77s/it]

Episode reward for episode 182 is 3.0
Average reward is 1.6685082872928176
Epsilon is 0.07403985919249237


 18%|█▊        | 182/1000 [03:31<20:13,  1.48s/it]

Episode reward for episode 183 is 0.0
Average reward is 1.6593406593406594
Epsilon is 0.07348075702751343


 18%|█▊        | 183/1000 [03:33<20:38,  1.52s/it]

Episode reward for episode 184 is 1.0
Average reward is 1.6557377049180328
Epsilon is 0.0724217670858096


 18%|█▊        | 184/1000 [03:35<23:05,  1.70s/it]

Episode reward for episode 185 is 1.0
Average reward is 1.6521739130434783
Epsilon is 0.0711581547310429


 18%|█▊        | 185/1000 [03:36<20:56,  1.54s/it]

Episode reward for episode 186 is 2.0
Average reward is 1.654054054054054
Epsilon is 0.07045702114445665


 19%|█▊        | 186/1000 [03:38<21:32,  1.59s/it]

Episode reward for episode 187 is 2.0
Average reward is 1.6559139784946237
Epsilon is 0.0693788358088805


 19%|█▊        | 187/1000 [03:39<21:32,  1.59s/it]

Episode reward for episode 188 is 0.0
Average reward is 1.6470588235294117
Epsilon is 0.06842043364963836


 19%|█▉        | 188/1000 [03:41<20:22,  1.51s/it]

Episode reward for episode 189 is 2.0
Average reward is 1.648936170212766
Epsilon is 0.06764234218218655


 19%|█▉        | 189/1000 [03:42<19:54,  1.47s/it]

Episode reward for episode 190 is 2.0
Average reward is 1.6507936507936507
Epsilon is 0.06682990092078744


 19%|█▉        | 190/1000 [03:44<20:45,  1.54s/it]

Episode reward for episode 191 is 2.0
Average reward is 1.6526315789473685
Epsilon is 0.06588424938266015


 19%|█▉        | 191/1000 [03:46<25:09,  1.87s/it]

Episode reward for episode 192 is 5.0
Average reward is 1.6701570680628273
Epsilon is 0.06446586906004985


 19%|█▉        | 192/1000 [03:47<21:41,  1.61s/it]

Episode reward for episode 193 is 1.0
Average reward is 1.6666666666666667
Epsilon is 0.06394425537830692


 19%|█▉        | 193/1000 [03:48<19:22,  1.44s/it]

Episode reward for episode 194 is 1.0
Average reward is 1.6632124352331605
Epsilon is 0.0633940314507273


 19%|█▉        | 194/1000 [03:50<19:46,  1.47s/it]

Episode reward for episode 195 is 3.0
Average reward is 1.6701030927835052
Epsilon is 0.06256881348983788


 20%|█▉        | 195/1000 [03:51<20:03,  1.49s/it]

Episode reward for episode 196 is 3.0
Average reward is 1.676923076923077
Epsilon is 0.06180942980344713


 20%|█▉        | 196/1000 [03:53<21:57,  1.64s/it]

Episode reward for episode 197 is 2.0
Average reward is 1.6785714285714286
Epsilon is 0.06082611325641531


 20%|█▉        | 197/1000 [03:59<38:41,  2.89s/it]

Episode reward for episode 198 is 4.0
Average reward is 1.6903553299492386
Epsilon is 0.0580584032491933


 20%|█▉        | 198/1000 [04:00<32:01,  2.40s/it]

Episode reward for episode 199 is 1.0
Average reward is 1.6868686868686869
Epsilon is 0.05747679372697423


 20%|█▉        | 199/1000 [04:02<28:39,  2.15s/it]

Episode reward for episode 200 is 4.0
Average reward is 1.6984924623115578
Epsilon is 0.05675675092663493


 20%|██        | 200/1000 [04:03<25:31,  1.91s/it]

Episode reward for episode 201 is 3.0
Average reward is 1.705
Epsilon is 0.056158856347735514


 20%|██        | 201/1000 [04:05<23:29,  1.76s/it]

Episode reward for episode 202 is 3.0
Average reward is 1.7114427860696517
Epsilon is 0.055556461843183885


 20%|██        | 202/1000 [04:06<19:58,  1.50s/it]

Episode reward for episode 203 is 1.0
Average reward is 1.7079207920792079
Epsilon is 0.055174518263094385


 20%|██        | 203/1000 [04:08<21:44,  1.64s/it]

Episode reward for episode 204 is 2.0
Average reward is 1.70935960591133
Epsilon is 0.0543472482906366


 20%|██        | 204/1000 [04:09<20:21,  1.54s/it]

Episode reward for episode 205 is 2.0
Average reward is 1.7107843137254901
Epsilon is 0.05385619200592462


 20%|██        | 205/1000 [04:11<21:31,  1.63s/it]

Episode reward for episode 206 is 4.0
Average reward is 1.721951219512195
Epsilon is 0.05310624407933018


 21%|██        | 206/1000 [04:12<20:49,  1.57s/it]

Episode reward for episode 207 is 3.0
Average reward is 1.7281553398058251
Epsilon is 0.05252867172920568


 21%|██        | 207/1000 [04:13<18:53,  1.43s/it]

Episode reward for episode 208 is 2.0
Average reward is 1.7294685990338163
Epsilon is 0.05211495814655484


 21%|██        | 208/1000 [04:15<19:35,  1.48s/it]

Episode reward for episode 209 is 3.0
Average reward is 1.7355769230769231
Epsilon is 0.051504232676029216


 21%|██        | 209/1000 [04:16<18:06,  1.37s/it]

Episode reward for episode 210 is 1.0
Average reward is 1.7320574162679425
Epsilon is 0.05107867545993963


 21%|██        | 210/1000 [04:17<17:51,  1.36s/it]

Episode reward for episode 211 is 3.0
Average reward is 1.7380952380952381
Epsilon is 0.05059149127772486


 21%|██        | 211/1000 [04:19<19:43,  1.50s/it]

Episode reward for episode 212 is 4.0
Average reward is 1.7488151658767772
Epsilon is 0.0499258276180886


 21%|██        | 212/1000 [04:20<18:13,  1.39s/it]

Episode reward for episode 213 is 1.0
Average reward is 1.7452830188679245
Epsilon is 0.049539309669167225


 21%|██▏       | 213/1000 [04:21<16:17,  1.24s/it]

Episode reward for episode 214 is 1.0
Average reward is 1.7417840375586855
Epsilon is 0.04923954013416758


 21%|██▏       | 214/1000 [04:22<15:01,  1.15s/it]

Episode reward for episode 215 is 0.0
Average reward is 1.733644859813084
Epsilon is 0.048902321120232795


 22%|██▏       | 215/1000 [04:23<14:19,  1.10s/it]

Episode reward for episode 216 is 1.0
Average reward is 1.7302325581395348
Epsilon is 0.048563278019330344


 22%|██▏       | 216/1000 [04:25<16:03,  1.23s/it]

Episode reward for episode 217 is 3.0
Average reward is 1.7361111111111112
Epsilon is 0.04804251937555173


 22%|██▏       | 217/1000 [04:26<16:27,  1.26s/it]

Episode reward for episode 218 is 1.0
Average reward is 1.7327188940092166
Epsilon is 0.04764160063929665


 22%|██▏       | 218/1000 [04:27<16:03,  1.23s/it]

Episode reward for episode 219 is 1.0
Average reward is 1.7293577981651376
Epsilon is 0.04725186359211314


 22%|██▏       | 219/1000 [04:29<16:37,  1.28s/it]

Episode reward for episode 220 is 2.0
Average reward is 1.730593607305936
Epsilon is 0.046787139621415265


 22%|██▏       | 220/1000 [04:31<21:43,  1.67s/it]

Episode reward for episode 221 is 2.0
Average reward is 1.731818181818182
Epsilon is 0.0459642841593869


 22%|██▏       | 221/1000 [04:33<23:18,  1.79s/it]

Episode reward for episode 222 is 6.0
Average reward is 1.751131221719457
Epsilon is 0.04533842219268193


 22%|██▏       | 222/1000 [04:36<25:48,  1.99s/it]

Episode reward for episode 223 is 5.0
Average reward is 1.7657657657657657
Epsilon is 0.04461908382599358


 22%|██▏       | 223/1000 [04:38<25:21,  1.96s/it]

Episode reward for episode 224 is 4.0
Average reward is 1.7757847533632287
Epsilon is 0.044093013626952074


 22%|██▏       | 224/1000 [04:39<22:42,  1.76s/it]

Episode reward for episode 225 is 3.0
Average reward is 1.78125
Epsilon is 0.04372482355726789


 22%|██▎       | 225/1000 [04:41<23:57,  1.86s/it]

Episode reward for episode 226 is 4.0
Average reward is 1.791111111111111
Epsilon is 0.0431298018051737


 23%|██▎       | 226/1000 [04:43<24:05,  1.87s/it]

Episode reward for episode 227 is 4.0
Average reward is 1.8008849557522124
Epsilon is 0.04260840879092783


 23%|██▎       | 227/1000 [04:45<23:34,  1.83s/it]

Episode reward for episode 228 is 4.0
Average reward is 1.8105726872246697
Epsilon is 0.04213858264681543


 23%|██▎       | 228/1000 [04:46<21:53,  1.70s/it]

Episode reward for episode 229 is 2.0
Average reward is 1.8114035087719298
Epsilon is 0.04176105506972052


 23%|██▎       | 229/1000 [04:48<24:15,  1.89s/it]

Episode reward for episode 230 is 5.0
Average reward is 1.8253275109170306
Epsilon is 0.04117289672679431


 23%|██▎       | 230/1000 [04:50<23:43,  1.85s/it]

Episode reward for episode 231 is 4.0
Average reward is 1.8347826086956522
Epsilon is 0.04075779642582164


 23%|██▎       | 231/1000 [04:51<20:54,  1.63s/it]

Episode reward for episode 232 is 1.0
Average reward is 1.8311688311688312
Epsilon is 0.04048559400136989


 23%|██▎       | 232/1000 [04:54<27:03,  2.11s/it]

Episode reward for episode 233 is 4.0
Average reward is 1.8405172413793103
Epsilon is 0.03973358304545455


 23%|██▎       | 233/1000 [04:56<25:07,  1.97s/it]

Episode reward for episode 234 is 3.0
Average reward is 1.8454935622317596
Epsilon is 0.03936993085025678


 23%|██▎       | 234/1000 [04:58<24:03,  1.88s/it]

Episode reward for episode 235 is 4.0
Average reward is 1.8547008547008548
Epsilon is 0.03899397659278685


 24%|██▎       | 235/1000 [05:01<28:22,  2.23s/it]

Episode reward for episode 236 is 5.0
Average reward is 1.8680851063829786
Epsilon is 0.0383296855629233


 24%|██▎       | 236/1000 [05:03<29:56,  2.35s/it]

Episode reward for episode 237 is 7.0
Average reward is 1.8898305084745763
Epsilon is 0.037763075763877316


 24%|██▎       | 237/1000 [05:05<25:23,  2.00s/it]

Episode reward for episode 238 is 1.0
Average reward is 1.8860759493670887
Epsilon is 0.03752313324094721


 24%|██▍       | 238/1000 [05:07<25:06,  1.98s/it]

Episode reward for episode 239 is 5.0
Average reward is 1.8991596638655461
Epsilon is 0.03714183398045785


 24%|██▍       | 239/1000 [05:09<28:54,  2.28s/it]

Episode reward for episode 240 is 8.0
Average reward is 1.9246861924686192
Epsilon is 0.03654562816102694


 24%|██▍       | 240/1000 [05:13<33:45,  2.67s/it]

Episode reward for episode 241 is 5.0
Average reward is 1.9375
Epsilon is 0.035884152395055435


 24%|██▍       | 241/1000 [05:15<32:42,  2.59s/it]

Episode reward for episode 242 is 7.0
Average reward is 1.95850622406639
Epsilon is 0.03546101829581293


 24%|██▍       | 242/1000 [05:17<28:24,  2.25s/it]

Episode reward for episode 243 is 2.0
Average reward is 1.9586776859504131
Epsilon is 0.03519736271904195


 24%|██▍       | 243/1000 [05:20<30:26,  2.41s/it]

Episode reward for episode 244 is 7.0
Average reward is 1.9794238683127572
Epsilon is 0.03475116270994927


 24%|██▍       | 244/1000 [05:26<44:43,  3.55s/it]

Episode reward for episode 245 is 7.0
Average reward is 2.0
Epsilon is 0.033718179273415694


 24%|██▍       | 245/1000 [05:28<39:58,  3.18s/it]

Episode reward for episode 246 is 6.0
Average reward is 2.016326530612245
Epsilon is 0.03334206608555698


 25%|██▍       | 246/1000 [05:32<40:18,  3.21s/it]

Episode reward for episode 247 is 5.0
Average reward is 2.0284552845528454
Epsilon is 0.03283174151926212


 25%|██▍       | 247/1000 [05:34<38:16,  3.05s/it]

Episode reward for episode 248 is 6.0
Average reward is 2.0445344129554655
Epsilon is 0.032442548223760456


 25%|██▍       | 248/1000 [05:37<38:54,  3.10s/it]

Episode reward for episode 249 is 6.0
Average reward is 2.060483870967742
Epsilon is 0.0320121892787935


 25%|██▍       | 249/1000 [05:43<49:25,  3.95s/it]

Episode reward for episode 250 is 5.0
Average reward is 2.072289156626506
Epsilon is 0.03124048403861739


 25%|██▌       | 250/1000 [05:45<41:59,  3.36s/it]

Episode reward for episode 251 is 4.0
Average reward is 2.08
Epsilon is 0.03097611894194468


 25%|██▌       | 251/1000 [05:48<39:27,  3.16s/it]

Episode reward for episode 252 is 3.0
Average reward is 2.0836653386454183
Epsilon is 0.03064959547080712


 25%|██▌       | 252/1000 [05:50<36:35,  2.93s/it]

Episode reward for episode 253 is 6.0
Average reward is 2.0992063492063493
Epsilon is 0.030370050659967306


 25%|██▌       | 253/1000 [05:53<34:26,  2.77s/it]

Episode reward for episode 254 is 7.0
Average reward is 2.1185770750988144
Epsilon is 0.030087750918346656


 25%|██▌       | 254/1000 [05:56<36:32,  2.94s/it]

Episode reward for episode 255 is 7.0
Average reward is 2.1377952755905514
Epsilon is 0.02971743706703073


 26%|██▌       | 255/1000 [05:59<36:59,  2.98s/it]

Episode reward for episode 256 is 7.0
Average reward is 2.156862745098039
Epsilon is 0.029379457330677997


 26%|██▌       | 256/1000 [06:01<32:46,  2.64s/it]

Episode reward for episode 257 is 4.0
Average reward is 2.1640625
Epsilon is 0.02918086941323715


 26%|██▌       | 257/1000 [06:03<31:45,  2.56s/it]

Episode reward for episode 258 is 8.0
Average reward is 2.186770428015564
Epsilon is 0.028929156267235422


 26%|██▌       | 258/1000 [06:06<33:12,  2.68s/it]

Episode reward for episode 259 is 13.0
Average reward is 2.2286821705426356
Epsilon is 0.028641031026542547


 26%|██▌       | 259/1000 [06:09<31:17,  2.53s/it]

Episode reward for episode 260 is 5.0
Average reward is 2.2393822393822393
Epsilon is 0.02842431281117195


 26%|██▌       | 260/1000 [06:11<29:15,  2.37s/it]

Episode reward for episode 261 is 5.0
Average reward is 2.25
Epsilon is 0.02823111847360087


 26%|██▌       | 261/1000 [06:13<29:33,  2.40s/it]

Episode reward for episode 262 is 6.0
Average reward is 2.264367816091954
Epsilon is 0.027995844006231115


 26%|██▌       | 262/1000 [06:17<34:11,  2.78s/it]

Episode reward for episode 263 is 9.0
Average reward is 2.2900763358778624
Epsilon is 0.027671574751990516


 26%|██▋       | 263/1000 [06:18<30:12,  2.46s/it]

Episode reward for episode 264 is 3.0
Average reward is 2.2927756653992395
Epsilon is 0.02751816361097921


 26%|██▋       | 264/1000 [06:21<30:25,  2.48s/it]

Episode reward for episode 265 is 6.0
Average reward is 2.3068181818181817
Epsilon is 0.027300347191276747


 26%|██▋       | 265/1000 [06:23<29:55,  2.44s/it]

Episode reward for episode 266 is 6.0
Average reward is 2.3207547169811322
Epsilon is 0.027130076099224423


 27%|██▋       | 266/1000 [06:25<28:01,  2.29s/it]

Episode reward for episode 267 is 6.0
Average reward is 2.3345864661654137
Epsilon is 0.02696934960876459


 27%|██▋       | 267/1000 [06:27<26:40,  2.18s/it]

Episode reward for episode 268 is 2.0
Average reward is 2.3333333333333335
Epsilon is 0.026816334796136924


 27%|██▋       | 268/1000 [06:31<32:24,  2.66s/it]

Episode reward for episode 269 is 12.0
Average reward is 2.3694029850746268
Epsilon is 0.02655037242934201


 27%|██▋       | 269/1000 [06:32<28:05,  2.31s/it]

Episode reward for episode 270 is 2.0
Average reward is 2.3680297397769516
Epsilon is 0.02643995728591423


 27%|██▋       | 270/1000 [06:36<33:20,  2.74s/it]

Episode reward for episode 271 is 14.0
Average reward is 2.411111111111111
Epsilon is 0.026171376492675175


 27%|██▋       | 271/1000 [06:39<34:46,  2.86s/it]

Episode reward for episode 272 is 9.0
Average reward is 2.4354243542435423
Epsilon is 0.025956731397930543


 27%|██▋       | 272/1000 [06:42<32:34,  2.68s/it]

Episode reward for episode 273 is 6.0
Average reward is 2.448529411764706
Epsilon is 0.025803852420377908


 27%|██▋       | 273/1000 [06:44<31:40,  2.61s/it]

Episode reward for episode 274 is 8.0
Average reward is 2.468864468864469
Epsilon is 0.025649244997523066


 27%|██▋       | 274/1000 [06:46<30:49,  2.55s/it]

Episode reward for episode 275 is 7.0
Average reward is 2.4854014598540144
Epsilon is 0.025502056368112966


 28%|██▊       | 275/1000 [06:48<28:33,  2.36s/it]

Episode reward for episode 276 is 6.0
Average reward is 2.498181818181818
Epsilon is 0.02537910448368514


 28%|██▊       | 276/1000 [06:51<28:09,  2.33s/it]

Episode reward for episode 277 is 8.0
Average reward is 2.5181159420289854
Epsilon is 0.025242098564004008


 28%|██▊       | 277/1000 [06:53<27:52,  2.31s/it]

Episode reward for episode 278 is 7.0
Average reward is 2.5342960288808665
Epsilon is 0.02510960400698571


 28%|██▊       | 278/1000 [06:57<33:14,  2.76s/it]

Episode reward for episode 279 is 8.0
Average reward is 2.553956834532374
Epsilon is 0.024894547877023752


 28%|██▊       | 279/1000 [06:59<31:04,  2.59s/it]

Episode reward for episode 280 is 9.0
Average reward is 2.5770609318996414
Epsilon is 0.02477752154819776


 28%|██▊       | 280/1000 [07:02<32:02,  2.67s/it]

Episode reward for episode 281 is 9.0
Average reward is 2.6
Epsilon is 0.024634470288957624


 28%|██▊       | 281/1000 [07:05<32:48,  2.74s/it]

Episode reward for episode 282 is 9.0
Average reward is 2.622775800711744
Epsilon is 0.024494803301075112


 28%|██▊       | 282/1000 [07:07<32:58,  2.76s/it]

Episode reward for episode 283 is 8.0
Average reward is 2.641843971631206
Epsilon is 0.024359345395630925


 28%|██▊       | 283/1000 [07:10<31:06,  2.60s/it]

Episode reward for episode 284 is 6.0
Average reward is 2.6537102473498235
Epsilon is 0.024254264567439888


 28%|██▊       | 284/1000 [07:13<33:04,  2.77s/it]

Episode reward for episode 285 is 11.0
Average reward is 2.683098591549296
Epsilon is 0.024100553265750432


 28%|██▊       | 285/1000 [07:16<34:57,  2.93s/it]

Episode reward for episode 286 is 11.0
Average reward is 2.712280701754386
Epsilon is 0.02395160532393132


 29%|██▊       | 286/1000 [07:18<32:25,  2.72s/it]

Episode reward for episode 287 is 6.0
Average reward is 2.7237762237762237
Epsilon is 0.023852498533199887


 29%|██▊       | 287/1000 [07:21<32:09,  2.71s/it]

Episode reward for episode 288 is 6.0
Average reward is 2.735191637630662
Epsilon is 0.02374387775611597


 29%|██▉       | 288/1000 [07:23<28:03,  2.36s/it]

Episode reward for episode 289 is 4.0
Average reward is 2.7395833333333335
Epsilon is 0.02367709084197014


 29%|██▉       | 289/1000 [07:25<26:46,  2.26s/it]

Episode reward for episode 290 is 6.0
Average reward is 2.7508650519031144
Epsilon is 0.02359348292928371


 29%|██▉       | 290/1000 [07:27<27:05,  2.29s/it]

Episode reward for episode 291 is 7.0
Average reward is 2.7655172413793103
Epsilon is 0.02350265729465379


 29%|██▉       | 291/1000 [07:29<26:38,  2.25s/it]

Episode reward for episode 292 is 7.0
Average reward is 2.7800687285223367
Epsilon is 0.023417543121795906


 29%|██▉       | 292/1000 [07:32<29:10,  2.47s/it]

Episode reward for episode 293 is 10.0
Average reward is 2.8047945205479454
Epsilon is 0.023306604749395814


 29%|██▉       | 293/1000 [07:35<31:02,  2.63s/it]

Episode reward for episode 294 is 9.0
Average reward is 2.825938566552901
Epsilon is 0.02319670920577627


 29%|██▉       | 294/1000 [07:37<27:06,  2.30s/it]

Episode reward for episode 295 is 4.0
Average reward is 2.8299319727891157
Epsilon is 0.02314408184851961


 30%|██▉       | 295/1000 [07:40<30:07,  2.56s/it]

Episode reward for episode 296 is 11.0
Average reward is 2.857627118644068
Epsilon is 0.02303533533431002


 30%|██▉       | 296/1000 [07:44<34:57,  2.98s/it]

Episode reward for episode 297 is 12.0
Average reward is 2.8885135135135136
Epsilon is 0.022909327362251635


 30%|██▉       | 297/1000 [07:47<34:45,  2.97s/it]

Episode reward for episode 298 is 7.0
Average reward is 2.9023569023569022
Epsilon is 0.022813760676044615


 30%|██▉       | 298/1000 [07:49<32:55,  2.81s/it]

Episode reward for episode 299 is 7.0
Average reward is 2.9161073825503356
Epsilon is 0.022737710281828385


 30%|██▉       | 299/1000 [07:51<28:25,  2.43s/it]

Episode reward for episode 300 is 3.0
Average reward is 2.9163879598662206
Epsilon is 0.022695333400505845


 30%|███       | 300/1000 [07:53<27:25,  2.35s/it]

Episode reward for episode 301 is 7.0
Average reward is 2.93
Epsilon is 0.02263246825868114


 30%|███       | 301/1000 [07:56<28:53,  2.48s/it]

Episode reward for episode 302 is 8.0
Average reward is 2.946843853820598
Epsilon is 0.02255262414746976


 30%|███       | 302/1000 [07:58<29:17,  2.52s/it]

Episode reward for episode 303 is 8.0
Average reward is 2.9635761589403975
Epsilon is 0.022478669458343817


 30%|███       | 303/1000 [08:01<29:59,  2.58s/it]

Episode reward for episode 304 is 13.0
Average reward is 2.9966996699669965
Epsilon is 0.022406376062933612


 30%|███       | 304/1000 [08:04<30:41,  2.65s/it]

Episode reward for episode 305 is 10.0
Average reward is 3.0197368421052633
Epsilon is 0.022332922812648995


 30%|███       | 305/1000 [08:06<28:58,  2.50s/it]

Episode reward for episode 306 is 13.0
Average reward is 3.0524590163934424
Epsilon is 0.022278510425871256


 31%|███       | 306/1000 [08:09<30:03,  2.60s/it]

Episode reward for episode 307 is 9.0
Average reward is 3.0718954248366015
Epsilon is 0.022211612544210012


 31%|███       | 307/1000 [08:11<27:10,  2.35s/it]

Episode reward for episode 308 is 5.0
Average reward is 3.0781758957654723
Epsilon is 0.022166952727233003


 31%|███       | 308/1000 [08:13<28:13,  2.45s/it]

Episode reward for episode 309 is 8.0
Average reward is 3.094155844155844
Epsilon is 0.022104171719910803


 31%|███       | 309/1000 [08:16<29:27,  2.56s/it]

Episode reward for episode 310 is 6.0
Average reward is 3.103559870550162
Epsilon is 0.022038311782151432


 31%|███       | 310/1000 [08:19<29:23,  2.56s/it]

Episode reward for episode 311 is 8.0
Average reward is 3.1193548387096772
Epsilon is 0.021981634298382964


 31%|███       | 311/1000 [08:20<26:12,  2.28s/it]

Episode reward for episode 312 is 5.0
Average reward is 3.12540192926045
Epsilon is 0.021945116568124534


 31%|███       | 312/1000 [08:22<25:50,  2.25s/it]

Episode reward for episode 313 is 6.0
Average reward is 3.1346153846153846
Epsilon is 0.02189974925701646


 31%|███▏      | 313/1000 [08:25<27:38,  2.41s/it]

Episode reward for episode 314 is 9.0
Average reward is 3.1533546325878596
Epsilon is 0.021843603181333722


 31%|███▏      | 314/1000 [08:28<27:15,  2.38s/it]

Episode reward for episode 315 is 7.0
Average reward is 3.1656050955414012
Epsilon is 0.021795928049356938


 32%|███▏      | 315/1000 [08:29<22:43,  1.99s/it]

Episode reward for episode 316 is 2.0
Average reward is 3.1619047619047618
Epsilon is 0.021774150838251863


 32%|███▏      | 316/1000 [08:30<21:25,  1.88s/it]

Episode reward for episode 317 is 4.0
Average reward is 3.1645569620253164
Epsilon is 0.021742501818856796


 32%|███▏      | 317/1000 [08:34<28:45,  2.53s/it]

Episode reward for episode 318 is 14.0
Average reward is 3.1987381703470033
Epsilon is 0.021667827543613388


 32%|███▏      | 318/1000 [08:38<32:02,  2.82s/it]

Episode reward for episode 319 is 11.0
Average reward is 3.2232704402515724
Epsilon is 0.02160788862849551


 32%|███▏      | 319/1000 [08:42<35:04,  3.09s/it]

Episode reward for episode 320 is 13.0
Average reward is 3.2539184952978055
Epsilon is 0.021542372642073954


 32%|███▏      | 320/1000 [08:45<35:49,  3.16s/it]

Episode reward for episode 321 is 11.0
Average reward is 3.278125
Epsilon is 0.0214869423649884


 32%|███▏      | 321/1000 [08:48<35:21,  3.12s/it]

Episode reward for episode 322 is 10.0
Average reward is 3.2990654205607477
Epsilon is 0.02143694870055509


 32%|███▏      | 322/1000 [08:51<36:13,  3.21s/it]

Episode reward for episode 323 is 9.0
Average reward is 3.3167701863354035
Epsilon is 0.02138530718250898


 32%|███▏      | 323/1000 [08:54<34:23,  3.05s/it]

Episode reward for episode 324 is 5.0
Average reward is 3.321981424148607
Epsilon is 0.021347595519132324


 32%|███▏      | 324/1000 [08:57<34:03,  3.02s/it]

Episode reward for episode 325 is 9.0
Average reward is 3.3395061728395063
Epsilon is 0.021307768052571948


 32%|███▎      | 325/1000 [09:00<32:25,  2.88s/it]

Episode reward for episode 326 is 8.0
Average reward is 3.353846153846154
Epsilon is 0.02127191279791831


 33%|███▎      | 326/1000 [09:02<29:30,  2.63s/it]

Episode reward for episode 327 is 9.0
Average reward is 3.371165644171779
Epsilon is 0.02124423627440025


 33%|███▎      | 327/1000 [09:04<29:19,  2.61s/it]

Episode reward for episode 328 is 8.0
Average reward is 3.385321100917431
Epsilon is 0.02121012287915391


 33%|███▎      | 328/1000 [09:06<26:30,  2.37s/it]

Episode reward for episode 329 is 5.0
Average reward is 3.3902439024390243
Epsilon is 0.021185923632238313


 33%|███▎      | 329/1000 [09:08<26:00,  2.33s/it]

Episode reward for episode 330 is 7.0
Average reward is 3.4012158054711246
Epsilon is 0.021158726905849388


 33%|███▎      | 330/1000 [09:12<29:48,  2.67s/it]

Episode reward for episode 331 is 11.0
Average reward is 3.4242424242424243
Epsilon is 0.02111462932698192


 33%|███▎      | 331/1000 [09:15<33:07,  2.97s/it]

Episode reward for episode 332 is 15.0
Average reward is 3.459214501510574
Epsilon is 0.021071566833190145


 33%|███▎      | 332/1000 [09:19<35:21,  3.18s/it]

Episode reward for episode 333 is 12.0
Average reward is 3.4849397590361444
Epsilon is 0.021029550096674992


 33%|███▎      | 333/1000 [09:22<35:20,  3.18s/it]

Episode reward for episode 334 is 14.0
Average reward is 3.5165165165165164
Epsilon is 0.02099453688485947


 33%|███▎      | 334/1000 [09:25<33:31,  3.02s/it]

Episode reward for episode 335 is 8.0
Average reward is 3.529940119760479
Epsilon is 0.02096610950447182


 34%|███▎      | 335/1000 [09:27<31:52,  2.88s/it]

Episode reward for episode 336 is 8.0
Average reward is 3.5432835820895523
Epsilon is 0.020939057944891905


 34%|███▎      | 336/1000 [09:30<31:21,  2.83s/it]

Episode reward for episode 337 is 8.0
Average reward is 3.556547619047619
Epsilon is 0.02091203392372953


 34%|███▎      | 337/1000 [09:32<29:10,  2.64s/it]

Episode reward for episode 338 is 7.0
Average reward is 3.5667655786350148
Epsilon is 0.02089004959549991


 34%|███▍      | 338/1000 [09:35<31:04,  2.82s/it]

Episode reward for episode 339 is 11.0
Average reward is 3.588757396449704
Epsilon is 0.02085926483999813


 34%|███▍      | 339/1000 [09:37<26:56,  2.45s/it]

Episode reward for episode 340 is 4.0
Average reward is 3.589970501474926
Epsilon is 0.0208442740844226


 34%|███▍      | 340/1000 [09:40<27:53,  2.53s/it]

Episode reward for episode 341 is 9.0
Average reward is 3.6058823529411765
Epsilon is 0.020818830568558547


 34%|███▍      | 341/1000 [09:41<24:56,  2.27s/it]

Episode reward for episode 342 is 4.0
Average reward is 3.6070381231671553
Epsilon is 0.02080502810226435


 34%|███▍      | 342/1000 [09:44<27:03,  2.47s/it]

Episode reward for episode 343 is 10.0
Average reward is 3.625730994152047
Epsilon is 0.02077905152521859


 34%|███▍      | 343/1000 [09:47<28:02,  2.56s/it]

Episode reward for episode 344 is 8.0
Average reward is 3.638483965014577
Epsilon is 0.02075678347826516


 34%|███▍      | 344/1000 [09:50<29:58,  2.74s/it]

Episode reward for episode 345 is 10.0
Average reward is 3.6569767441860463
Epsilon is 0.02073060808931608


 34%|███▍      | 345/1000 [09:53<28:24,  2.60s/it]

Episode reward for episode 346 is 6.0
Average reward is 3.663768115942029
Epsilon is 0.020713139594850105


 35%|███▍      | 346/1000 [09:55<27:49,  2.55s/it]

Episode reward for episode 347 is 8.0
Average reward is 3.6763005780346822
Epsilon is 0.0206940036268171


 35%|███▍      | 347/1000 [09:57<27:08,  2.49s/it]

Episode reward for episode 348 is 8.0
Average reward is 3.6887608069164264
Epsilon is 0.020676327337865567


 35%|███▍      | 348/1000 [10:00<27:10,  2.50s/it]

Episode reward for episode 349 is 8.0
Average reward is 3.7011494252873565
Epsilon is 0.020658047545592615


 35%|███▍      | 349/1000 [10:02<26:29,  2.44s/it]

Episode reward for episode 350 is 7.0
Average reward is 3.7106017191977076
Epsilon is 0.020641415329127066


 35%|███▌      | 350/1000 [10:05<26:49,  2.48s/it]

Episode reward for episode 351 is 7.0
Average reward is 3.72
Epsilon is 0.020624828483987244


 35%|███▌      | 351/1000 [10:07<25:14,  2.33s/it]

Episode reward for episode 352 is 9.0
Average reward is 3.735042735042735
Epsilon is 0.02061123236304237


 35%|███▌      | 352/1000 [10:10<27:08,  2.51s/it]

Episode reward for episode 353 is 9.0
Average reward is 3.75
Epsilon is 0.020592100975140158


 35%|███▌      | 353/1000 [10:13<28:46,  2.67s/it]

Episode reward for episode 354 is 9.0
Average reward is 3.764872521246459
Epsilon is 0.020572536899401472


 35%|███▌      | 354/1000 [10:15<27:35,  2.56s/it]

Episode reward for episode 355 is 7.0
Average reward is 3.774011299435028
Epsilon is 0.020557954370898485


 36%|███▌      | 355/1000 [10:18<27:14,  2.53s/it]

Episode reward for episode 356 is 8.0
Average reward is 3.7859154929577463
Epsilon is 0.020542982551800943


 36%|███▌      | 356/1000 [10:21<29:35,  2.76s/it]

Episode reward for episode 357 is 10.0
Average reward is 3.803370786516854
Epsilon is 0.020523992403172944


 36%|███▌      | 357/1000 [10:23<27:27,  2.56s/it]

Episode reward for episode 358 is 5.0
Average reward is 3.80672268907563
Epsilon is 0.020512487943805745


 36%|███▌      | 358/1000 [10:25<26:38,  2.49s/it]

Episode reward for episode 359 is 7.0
Average reward is 3.815642458100559
Epsilon is 0.0204999345483004


 36%|███▌      | 359/1000 [10:28<27:38,  2.59s/it]

Episode reward for episode 360 is 11.0
Average reward is 3.8356545961002784
Epsilon is 0.020484480501749724


 36%|███▌      | 360/1000 [10:30<25:26,  2.39s/it]

Episode reward for episode 361 is 5.0
Average reward is 3.838888888888889
Epsilon is 0.02047403311705276


 36%|███▌      | 361/1000 [10:33<27:25,  2.58s/it]

Episode reward for episode 362 is 9.0
Average reward is 3.853185595567867
Epsilon is 0.020459104193989115


 36%|███▌      | 362/1000 [10:36<27:15,  2.56s/it]

Episode reward for episode 363 is 8.0
Average reward is 3.8646408839779007
Epsilon is 0.02044669551430415


 36%|███▋      | 363/1000 [10:38<27:56,  2.63s/it]

Episode reward for episode 364 is 10.0
Average reward is 3.881542699724518
Epsilon is 0.020433233648525843


 36%|███▋      | 364/1000 [10:40<25:16,  2.38s/it]

Episode reward for episode 365 is 5.0
Average reward is 3.8846153846153846
Epsilon is 0.020424655047516876


 36%|███▋      | 365/1000 [10:43<27:58,  2.64s/it]

Episode reward for episode 366 is 10.0
Average reward is 3.9013698630136986
Epsilon is 0.02040939365976431


 37%|███▋      | 366/1000 [10:46<27:46,  2.63s/it]

Episode reward for episode 367 is 8.0
Average reward is 3.912568306010929
Epsilon is 0.02039793042818705


 37%|███▋      | 367/1000 [10:49<29:12,  2.77s/it]

Episode reward for episode 368 is 9.0
Average reward is 3.9264305177111716
Epsilon is 0.020384320633191847


 37%|███▋      | 368/1000 [10:51<25:37,  2.43s/it]

Episode reward for episode 369 is 4.0
Average reward is 3.926630434782609
Epsilon is 0.02037754025035207


 37%|███▋      | 369/1000 [10:55<32:29,  3.09s/it]

Episode reward for episode 370 is 13.0
Average reward is 3.951219512195122
Epsilon is 0.020358840208042326


 37%|███▋      | 370/1000 [10:59<34:00,  3.24s/it]

Episode reward for episode 371 is 6.0
Average reward is 3.9567567567567568
Epsilon is 0.020345252898097625


 37%|███▋      | 371/1000 [11:02<33:34,  3.20s/it]

Episode reward for episode 372 is 10.0
Average reward is 3.973045822102426
Epsilon is 0.020333778362192


 37%|███▋      | 372/1000 [11:05<33:41,  3.22s/it]

Episode reward for episode 373 is 11.0
Average reward is 3.9919354838709675
Epsilon is 0.020322169300864848


 37%|███▋      | 373/1000 [11:09<34:26,  3.30s/it]

Episode reward for episode 374 is 12.0
Average reward is 4.013404825737266
Epsilon is 0.020310156555100458


 37%|███▋      | 374/1000 [11:11<30:00,  2.88s/it]

Episode reward for episode 375 is 5.0
Average reward is 4.016042780748663
Epsilon is 0.020303954246897994


 38%|███▊      | 375/1000 [11:12<26:47,  2.57s/it]

Episode reward for episode 376 is 5.0
Average reward is 4.018666666666666
Epsilon is 0.02029769729642968


 38%|███▊      | 376/1000 [11:17<33:30,  3.22s/it]

Episode reward for episode 377 is 15.0
Average reward is 4.047872340425532
Epsilon is 0.020282612637081778


 38%|███▊      | 377/1000 [11:20<31:24,  3.02s/it]

Episode reward for episode 378 is 7.0
Average reward is 4.0557029177718835
Epsilon is 0.020275304342700647


 38%|███▊      | 378/1000 [11:22<28:21,  2.74s/it]

Episode reward for episode 379 is 6.0
Average reward is 4.060846560846561
Epsilon is 0.020269098420043603


 38%|███▊      | 379/1000 [11:24<25:38,  2.48s/it]

Episode reward for episode 380 is 6.0
Average reward is 4.065963060686016
Epsilon is 0.020263769914315683


 38%|███▊      | 380/1000 [11:27<26:59,  2.61s/it]

Episode reward for episode 381 is 10.0
Average reward is 4.081578947368421
Epsilon is 0.020255360733444243


 38%|███▊      | 381/1000 [11:31<31:15,  3.03s/it]

Episode reward for episode 382 is 13.0
Average reward is 4.10498687664042
Epsilon is 0.02024436846445269


 38%|███▊      | 382/1000 [11:32<26:48,  2.60s/it]

Episode reward for episode 383 is 4.0
Average reward is 4.104712041884817
Epsilon is 0.020240009183322666


 38%|███▊      | 383/1000 [11:34<24:29,  2.38s/it]

Episode reward for episode 384 is 5.0
Average reward is 4.107049608355092
Epsilon is 0.020235115571402205


 38%|███▊      | 384/1000 [11:38<30:07,  2.93s/it]

Episode reward for episode 385 is 17.0
Average reward is 4.140625
Epsilon is 0.02022459015019209


 38%|███▊      | 385/1000 [11:41<29:03,  2.83s/it]

Episode reward for episode 386 is 9.0
Average reward is 4.153246753246753
Epsilon is 0.020218170569653623


 39%|███▊      | 386/1000 [11:44<28:54,  2.83s/it]

Episode reward for episode 387 is 9.0
Average reward is 4.16580310880829
Epsilon is 0.02021155334448138


 39%|███▊      | 387/1000 [11:45<24:16,  2.38s/it]

Episode reward for episode 388 is 3.0
Average reward is 4.162790697674419
Epsilon is 0.020208612233473682


 39%|███▉      | 388/1000 [11:48<24:23,  2.39s/it]

Episode reward for episode 389 is 7.0
Average reward is 4.170103092783505
Epsilon is 0.020203095677644996


 39%|███▉      | 389/1000 [11:50<25:09,  2.47s/it]

Episode reward for episode 390 is 9.0
Average reward is 4.182519280205655
Epsilon is 0.020197093293253594


 39%|███▉      | 390/1000 [11:54<29:09,  2.87s/it]

Episode reward for episode 391 is 13.0
Average reward is 4.205128205128205
Epsilon is 0.020189175884182144


 39%|███▉      | 391/1000 [11:56<25:50,  2.55s/it]

Episode reward for episode 392 is 6.0
Average reward is 4.209718670076726
Epsilon is 0.020185392868415183


 39%|███▉      | 392/1000 [11:59<27:04,  2.67s/it]

Episode reward for episode 393 is 10.0
Average reward is 4.224489795918367
Epsilon is 0.020179554213330647


 39%|███▉      | 393/1000 [12:01<25:35,  2.53s/it]

Episode reward for episode 394 is 6.0
Average reward is 4.229007633587786
Epsilon is 0.020175296212601532


 39%|███▉      | 394/1000 [12:04<26:48,  2.65s/it]

Episode reward for episode 395 is 9.0
Average reward is 4.241116751269035
Epsilon is 0.02016987743160528


 40%|███▉      | 395/1000 [12:06<26:16,  2.61s/it]

Episode reward for episode 396 is 7.0
Average reward is 4.248101265822785
Epsilon is 0.020165319040619554


 40%|███▉      | 396/1000 [12:09<27:31,  2.73s/it]

Episode reward for episode 397 is 12.0
Average reward is 4.267676767676767
Epsilon is 0.020159984540142235


 40%|███▉      | 397/1000 [12:12<28:19,  2.82s/it]

Episode reward for episode 398 is 10.0
Average reward is 4.282115869017632
Epsilon is 0.020154667428076492


 40%|███▉      | 398/1000 [12:15<27:11,  2.71s/it]

Episode reward for episode 399 is 7.0
Average reward is 4.288944723618091
Epsilon is 0.02015078834826695


 40%|███▉      | 399/1000 [12:17<25:23,  2.53s/it]

Episode reward for episode 400 is 7.0
Average reward is 4.295739348370927
Epsilon is 0.02014733032673928


 40%|████      | 400/1000 [12:19<22:27,  2.25s/it]

Episode reward for episode 401 is 4.0
Average reward is 4.295
Epsilon is 0.020144817913809607


 40%|████      | 401/1000 [12:21<22:55,  2.30s/it]

Episode reward for episode 402 is 7.0
Average reward is 4.301745635910224
Epsilon is 0.020141072957615466


 40%|████      | 402/1000 [12:25<27:11,  2.73s/it]

Episode reward for episode 403 is 12.0
Average reward is 4.32089552238806
Epsilon is 0.020135433018062348


 40%|████      | 403/1000 [12:29<31:19,  3.15s/it]

Episode reward for episode 404 is 12.0
Average reward is 4.3399503722084365
Epsilon is 0.02012942184513148


 40%|████      | 404/1000 [12:37<46:38,  4.70s/it]

Episode reward for episode 405 is 9.0
Average reward is 4.351485148514851
Epsilon is 0.02011809355938558


 40%|████      | 405/1000 [12:40<41:46,  4.21s/it]

Episode reward for episode 406 is 10.0
Average reward is 4.3654320987654325
Epsilon is 0.020114305785515627


 41%|████      | 406/1000 [12:43<37:31,  3.79s/it]

Episode reward for episode 407 is 8.0
Average reward is 4.374384236453202
Epsilon is 0.020110838832468534


 41%|████      | 407/1000 [12:46<36:00,  3.64s/it]

Episode reward for episode 408 is 11.0
Average reward is 4.3906633906633905
Epsilon is 0.02010702657696328


 41%|████      | 408/1000 [12:54<49:14,  4.99s/it]

Episode reward for episode 409 is 15.0
Average reward is 4.416666666666667
Epsilon is 0.020098265911471302


 41%|████      | 409/1000 [13:01<54:22,  5.52s/it]

Episode reward for episode 410 is 19.0
Average reward is 4.452322738386308
Epsilon is 0.020091622528563266


 41%|████      | 410/1000 [13:03<44:13,  4.50s/it]

Episode reward for episode 411 is 5.0
Average reward is 4.453658536585366
Epsilon is 0.020089539259833325


 41%|████      | 411/1000 [13:06<38:55,  3.97s/it]

Episode reward for episode 412 is 8.0
Average reward is 4.462287104622871
Epsilon is 0.020086962516975483


 41%|████      | 412/1000 [13:09<37:08,  3.79s/it]

Episode reward for episode 413 is 10.0
Average reward is 4.475728155339806
Epsilon is 0.02008402187540725


 41%|████▏     | 413/1000 [13:12<32:42,  3.34s/it]

Episode reward for episode 414 is 6.0
Average reward is 4.479418886198547
Epsilon is 0.020081980153415727


 41%|████▏     | 414/1000 [13:16<35:26,  3.63s/it]

Episode reward for episode 415 is 13.0
Average reward is 4.5
Epsilon is 0.02007840417562448


 42%|████▏     | 415/1000 [13:18<31:39,  3.25s/it]

Episode reward for episode 416 is 6.0
Average reward is 4.5036144578313255
Epsilon is 0.020076575500322436


 42%|████▏     | 416/1000 [13:22<33:33,  3.45s/it]

Episode reward for episode 417 is 14.0
Average reward is 4.5264423076923075
Epsilon is 0.020073455309439537


 42%|████▏     | 417/1000 [13:25<31:44,  3.27s/it]

Episode reward for episode 418 is 8.0
Average reward is 4.534772182254197
Epsilon is 0.020071355696992053


 42%|████▏     | 418/1000 [13:29<32:31,  3.35s/it]

Episode reward for episode 419 is 12.0
Average reward is 4.552631578947368
Epsilon is 0.020068626392183743


 42%|████▏     | 419/1000 [13:32<32:55,  3.40s/it]

Episode reward for episode 420 is 10.0
Average reward is 4.56563245823389
Epsilon is 0.020066133616402575


 42%|████▏     | 420/1000 [13:36<33:21,  3.45s/it]

Episode reward for episode 421 is 11.0
Average reward is 4.580952380952381
Epsilon is 0.020063795151135683


 42%|████▏     | 421/1000 [13:38<29:50,  3.09s/it]

Episode reward for episode 422 is 6.0
Average reward is 4.584323040380047
Epsilon is 0.020062269839192293


 42%|████▏     | 422/1000 [13:41<29:52,  3.10s/it]

Episode reward for episode 423 is 9.0
Average reward is 4.5947867298578196
Epsilon is 0.020060284630447157


 42%|████▏     | 423/1000 [13:43<26:11,  2.72s/it]

Episode reward for episode 424 is 4.0
Average reward is 4.59338061465721
Epsilon is 0.020059126379972456


 42%|████▏     | 424/1000 [13:45<25:06,  2.61s/it]

Episode reward for episode 425 is 6.0
Average reward is 4.596698113207547
Epsilon is 0.02005763195490072


 42%|████▎     | 425/1000 [13:51<33:02,  3.45s/it]

Episode reward for episode 426 is 11.0
Average reward is 4.6117647058823525
Epsilon is 0.02005448237127253


 43%|████▎     | 426/1000 [13:53<28:59,  3.03s/it]

Episode reward for episode 427 is 5.0
Average reward is 4.612676056338028
Epsilon is 0.02005337151552183


 43%|████▎     | 427/1000 [13:57<30:54,  3.24s/it]

Episode reward for episode 428 is 13.0
Average reward is 4.6323185011709604
Epsilon is 0.020051299304083166


 43%|████▎     | 428/1000 [14:00<31:04,  3.26s/it]

Episode reward for episode 429 is 13.0
Average reward is 4.651869158878505
Epsilon is 0.020049574529637036


 43%|████▎     | 429/1000 [14:02<29:03,  3.05s/it]

Episode reward for episode 430 is 6.0
Average reward is 4.655011655011655
Epsilon is 0.020048282886758365


 43%|████▎     | 430/1000 [14:05<28:33,  3.01s/it]

Episode reward for episode 431 is 8.0
Average reward is 4.662790697674419
Epsilon is 0.020046874657915176


 43%|████▎     | 431/1000 [14:09<29:04,  3.07s/it]

Episode reward for episode 432 is 8.0
Average reward is 4.6705336426914155
Epsilon is 0.02004525337198859


 43%|████▎     | 432/1000 [14:11<26:48,  2.83s/it]

Episode reward for episode 433 is 11.0
Average reward is 4.685185185185185
Epsilon is 0.020044153720209436


 43%|████▎     | 432/1000 [14:11<18:40,  1.97s/it]


KeyboardInterrupt: 