In [25]:
! pip install tqdm, pygame, pytorchvision


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.3[0m[39;49m -> [0m[32;49m22.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m


In [1]:
import torch.nn.functional as F
import torch 
from torch import nn
import torch.optim as optim
import torchvision.transforms as T

from torch.distributions import Categorical
import numpy as np
import pandas as pd
import gym

from collections import deque
from PIL import Image
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
class Actor(nn.Module):
    def __init__(self, input_size, output_size):
        super(Actor, self).__init__()
        self.linear1 = nn.Linear(in_features=input_size, out_features=256)
        self.linear2 = nn.Linear(in_features=256, out_features=256)
        self.linear3 = nn.Linear(in_features=256, out_features=output_size)
        self.relu = nn.ReLU()
        self.softmax = nn.Softmax()

    def forward(self, x):
        x = self.relu(self.linear1(x))
        x = self.relu(self.linear2(x))
        x = self.linear3(x)
        output = F.softmax(x, dim=-1)
        return output

class Critic(nn.Module):
    def __init__(self, input_size, output_size):
        super(Critic, self).__init__()
        self.linear1 = nn.Linear(in_features=input_size, out_features=256)
        self.linear2 = nn.Linear(in_features=256, out_features=256)
        self.linear3 = nn.Linear(in_features=256, out_features=output_size)
        self.relu = nn.ReLU()

    def forward(self, x):
        x = self.relu(self.linear1(x))
        x = self.relu(self.linear2(x))
        output = self.linear3(x)
        return output

In [4]:


class CartPoleVanillaDQN():
    def __init__(self, env, typ="dense"):
      self.env = env
      self.typ = typ
      self.max_episodes = 30000
      self.max_steps = 500
      self.gamma = 0.99
      self.epsilon = 1.0
      self.epsilon_min = 0.001
      self.epsilon_decay_lamda = self.epsilon_min**(1/self.max_episodes)
      self.target_update_counter = 0
      self.target_update_frequency = 10
      self.buffer_minibatch_size = 120
      self.replay_buffer_capacity=5000
      self.replay_buffer = deque([],maxlen=self.replay_buffer_capacity)
      self.resize = T.Compose([T.ToPILImage(),
                    T.Resize(84, interpolation=Image.CUBIC),
                    T.Grayscale(),
                    T.ToTensor()])  
      env.reset()
      _, _, self.state_screen_h, self.state_screen_w = self.screen_preprocessor(self.env.render()).shape
      self.state_count = env.observation_space.shape[0]
      self.action_count = env.action_space.n
      self.actor = Actor(self.state_count, self.action_count) if self.typ=="dense" else None
      self.critic = Critic(self.state_count, 1) if self.typ=="dense" else None
      # self.rms_optimizer = optim.RMSprop(self.q_value_dqn.parameters())#, lr=0.001)#, weight_decay=0.05)
      # self.loss_func = F.smooth_l1_loss
      self.actor_optimizer = optim.SGD(self.actor.parameters(), lr=0.0009)#optim.RMSprop(self.q_value_dqn.parameters(), lr=0.001)#, weight_decay=0.05)
      self.critic_optimizer = optim.SGD(self.critic.parameters(), lr=0.0009)
      self.loss_func = F.mse_loss #F.smooth_l1_loss
      
      self.timestep_list = []
      self.rewards_list = []
      self.epsilon_list = []
      
      self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
      # self.q_value_dqn#.to(self.device)
      # self.target_dqn#.to(self.device)

      self.test_max_episodes = 10
      self.test_timestep_list = []
      self.test_rewards_list = []
      self.test_epsilon_list = []
      self.trained_policy_path = "./baseline/cartpole.pth"

    def initialize_buffer(self):
      for i in range(self.replay_buffer_capacity):
        self.env.reset()
        current_state = self.screen_preprocessor(self.env.render())
        done = False
        while not done:  
          current_action = self.choose_action(current_state)#.to(self.device)
          curr_obs, curr_reward, done, truncated, info = self.env.step(current_action.item())
          next_state = self.screen_preprocessor(self.env.render())

          self.add_to_replay_buffer((current_state, current_action, curr_reward, next_state, done))
          current_state = next_state

    def screen_preprocessor(self,state_screen):
      def crop(variable,tw,th):
         c, h, w = variable.shape
         x1 = int(round((w - tw) / 2.))
         y1 = int(round((h - th) / 2.))
         return variable[:,y1:y1+th,x1:x1+tw]
      state_screen = state_screen.transpose((2,0,1))
      screen_c, screen_h, screen_w = state_screen.shape
      screen = self.resize(torch.from_numpy(state_screen))
      screen = crop(screen, 60, 60)
      screen = np.ascontiguousarray(screen, dtype=np.float32) / 255
      screen = torch.from_numpy(screen)
      return screen.unsqueeze(0)
    
    def choose_action(self, state, test=False):
      random_no = np.random.random()
      if self.epsilon > random_no and not test:
          return torch.tensor([[np.random.choice(self.action_count)]])
      else:
          return self.get_q_value_for_state(state)#.to(self.device))
    
    def get_q_value_for_state(self, state):
      q_values = self.q_value_dqn(state)
      return q_values.max(1)[1].view(1, 1)

    def eval(self):
      self.q_value_dqn.load_state_dict(torch.load(self.trained_policy_path))

      for episode in tqdm(range(0, self.test_max_episodes), unit='episodes'):
        obs = self.env.reset()[0]
        done = False
        reward_sum = 0
        steps = 0
        current_state = torch.from_numpy(obs).unsqueeze(0) if self.typ=="dense" else self.screen_preprocessor(self.env.render())
        while not done:
          tt = time.time()

          current_action = self.choose_action(current_state, test=True)#.to(self.device)
          curr_obs, curr_reward, done, truncated, info = self.env.step(current_action.item())
          self.env.render()
          
          reward_sum += curr_reward
          next_state = torch.from_numpy(curr_obs).unsqueeze(0) if self.typ=="dense" else self.screen_preprocessor(self.env.render())

          steps +=1
          
          current_state = next_state
          if steps==500 or reward_sum==470:
            done=True

        self.update_target_net(episode)
            
        self.test_timestep_list.append(steps)
        self.test_rewards_list.append(reward_sum)


    def train(self, current_state, reward, next_state, done, action_log_prob, I):
      
      current_state_val = self.critic(current_state)
      next_state_val = self.critic(next_state) if not done else 0
  
      delta = reward + self.gamma*next_state_val - current_state_val
      actor_loss = -1*I*action_log_prob*delta
      critic_loss = delta**2

      self.actor_optimizer.zero_grad()
      self.critic.zero_grad()
      
      (actor_loss + critic_loss).backward()
      # critic_loss.backward()
      # for param in self.q_value_dqn.parameters():
      #     param.grad.data.clamp_(-1, 1)
      self.actor_optimizer.step()
      self.critic_optimizer.step()

      return actor_loss.item(), critic_loss.item()
      
    def learn(self):
      
      for episode in tqdm(range(0, self.max_episodes), unit='episodes'):
        obs = self.env.reset()[0]
        done = False
        reward_sum = 0
        steps = 0
        a_running_loss=0
        c_running_loss=0
        I = 1
        current_state = torch.from_numpy(obs).unsqueeze(0) if self.typ=="dense" else self.screen_preprocessor(self.env.render())
        while not done:
          action_prob = self.actor(current_state)
          actions_cat = Categorical(action_prob)

          current_action = actions_cat.sample()
          action_log_prob = actions_cat.log_prob(current_action)

          curr_obs, curr_reward, done, truncated, info = self.env.step(current_action.item())

          reward_sum += curr_reward
          
          next_state = torch.from_numpy(curr_obs).unsqueeze(0) if self.typ=="dense" else self.screen_preprocessor(self.env.render())
  
          actor_loss, critic_loss = self.train(current_state, curr_reward, next_state, done, action_log_prob, I)
          I = self.gamma*I
          steps +=1
          a_running_loss += actor_loss
          c_running_loss += critic_loss
          current_state = next_state

          if steps==self.max_steps or reward_sum==470:
            done=True

        # self.update_target_net(episode)
            
        self.timestep_list.append(steps)
        self.rewards_list.append(reward_sum)
        self.epsilon_list.append(self.epsilon)
        self.epsilon = max(0.01, self.epsilon*self.epsilon_decay_lamda)
      
        if episode % 100 == 99:
          # print(f"Buffer_size: {len(self.replay_buffer)}|Train_time:{sum(self.tm[-100:])/100}| Sub_t:{sum(self.tr[-100:])/100}")
          # print(f'[Episode: {episode+1}] - Steps: {steps+1} | Rewards: {reward_sum+1} | Epsilon: {self.epsilon} | loss: {running_loss / 10:.3f}')
          print(f'[Episode: {episode+1}] - Steps: {sum(self.timestep_list[-100:])/100} | Rewards: {sum(self.rewards_list[-100:])/100} | Epsilon: {self.epsilon} | loss: {a_running_loss / 10:.3f}, {c_running_loss / 10:.3f}')
          running_loss = 0.0
          if sum(self.rewards_list[-100:])/100 >= 470:
            break
          # p.print()
        
    def update_target_net(self, episode):
       if episode % self.target_update_frequency == self.target_update_frequency-1:   
          self.critic.load_state_dict(self.actor.state_dict())
    
    def get_minibatch_from_replay_buffer(self):
      exp_batch = random.sample(self.replay_buffer, self.buffer_minibatch_size) #max(len(self.replay_buffer), self.buffer_minibatch_size))
      current_state_batch = torch.cat(tuple([exp[0] for exp in exp_batch]),dim=0)
      current_action_batch = torch.cat(tuple([exp[1] for exp in exp_batch]),dim=0)
      next_state_batch = torch.cat(tuple([exp[3] for exp in exp_batch]),dim=0)
      current_reward_batch = torch.tensor(tuple([exp[2] for exp in exp_batch]))#.to(self.device)
      current_done_batch = torch.tensor(tuple([exp[4] for exp in exp_batch]))#.to(self.device)
      return current_state_batch, current_action_batch, next_state_batch, current_reward_batch, current_done_batch
    
    def add_to_replay_buffer(self, experience):
       self.replay_buffer.append(experience)



In [5]:
env = gym.make('CartPole-v1', render_mode="rgb_array")
env.reset()
agent = CartPoleVanillaDQN(env)
agent.learn()

  T.Resize(84, interpolation=Image.CUBIC),
  0%|          | 101/30000 [00:08<40:19, 12.36episodes/s] 

[Episode: 100] - Steps: 22.46 | Rewards: 22.46 | Epsilon: 0.9772372209558056 | loss: 0.776, 1.159


  1%|          | 202/30000 [00:16<40:28, 12.27episodes/s]

[Episode: 200] - Steps: 22.5 | Rewards: 22.5 | Epsilon: 0.9549925860214258 | loss: 1.343, 2.216


  1%|          | 301/30000 [00:27<49:05, 10.08episodes/s]  

[Episode: 300] - Steps: 28.61 | Rewards: 28.61 | Epsilon: 0.933254300796976 | loss: 1.821, 3.258


  1%|▏         | 401/30000 [00:40<1:14:48,  6.59episodes/s]

[Episode: 400] - Steps: 36.97 | Rewards: 36.97 | Epsilon: 0.91201083935589 | loss: 2.467, 5.327


  2%|▏         | 500/30000 [00:58<1:05:11,  7.54episodes/s]

[Episode: 500] - Steps: 54.54 | Rewards: 54.54 | Epsilon: 0.8912509381337214 | loss: 2.121, 5.380


  2%|▏         | 601/30000 [01:12<1:13:10,  6.70episodes/s]

[Episode: 600] - Steps: 64.32 | Rewards: 64.32 | Epsilon: 0.8709635899560526 | loss: 1.293, 4.097


  2%|▏         | 700/30000 [01:27<1:15:13,  6.49episodes/s]

[Episode: 700] - Steps: 68.07 | Rewards: 68.07 | Epsilon: 0.8511380382023443 | loss: 2.100, 8.516


  3%|▎         | 801/30000 [01:43<1:13:14,  6.64episodes/s]

[Episode: 800] - Steps: 74.64 | Rewards: 74.64 | Epsilon: 0.8317637711026356 | loss: 1.389, 6.061


  3%|▎         | 901/30000 [01:59<1:20:04,  6.06episodes/s]

[Episode: 900] - Steps: 75.02 | Rewards: 75.02 | Epsilon: 0.8128305161640605 | loss: 1.063, 5.238


  3%|▎         | 1001/30000 [02:15<1:00:21,  8.01episodes/s]

[Episode: 1000] - Steps: 72.86 | Rewards: 72.86 | Epsilon: 0.79432823472424 | loss: 0.467, 1.805


  4%|▎         | 1100/30000 [02:31<1:02:12,  7.74episodes/s]

[Episode: 1100] - Steps: 73.52 | Rewards: 73.52 | Epsilon: 0.7762471166286468 | loss: 0.317, 4.285


  4%|▍         | 1200/30000 [02:46<1:21:13,  5.91episodes/s]

[Episode: 1200] - Steps: 68.9 | Rewards: 68.9 | Epsilon: 0.7585775750291356 | loss: 0.543, 5.128


  4%|▍         | 1301/30000 [03:02<1:19:18,  6.03episodes/s]

[Episode: 1300] - Steps: 73.58 | Rewards: 73.58 | Epsilon: 0.7413102413008664 | loss: 0.180, 1.592


  5%|▍         | 1401/30000 [03:20<1:16:38,  6.22episodes/s]

[Episode: 1400] - Steps: 83.2 | Rewards: 83.2 | Epsilon: 0.7244359600749358 | loss: 0.176, 1.231


  5%|▌         | 1500/30000 [03:39<1:22:27,  5.76episodes/s]

[Episode: 1500] - Steps: 85.51 | Rewards: 85.51 | Epsilon: 0.7079457843840814 | loss: 0.245, 2.352


  5%|▌         | 1600/30000 [03:57<1:29:42,  5.28episodes/s]

[Episode: 1600] - Steps: 86.39 | Rewards: 86.39 | Epsilon: 0.6918309709188775 | loss: -0.053, 0.670


  6%|▌         | 1700/30000 [04:16<2:23:13,  3.29episodes/s]

[Episode: 1700] - Steps: 83.81 | Rewards: 83.81 | Epsilon: 0.6760829753919201 | loss: 0.195, 1.056


  6%|▌         | 1800/30000 [04:35<1:15:32,  6.22episodes/s]

[Episode: 1800] - Steps: 88.68 | Rewards: 88.68 | Epsilon: 0.6606934480075326 | loss: 0.223, 1.581


  6%|▋         | 1900/30000 [04:55<1:49:35,  4.27episodes/s]

[Episode: 1900] - Steps: 92.35 | Rewards: 92.35 | Epsilon: 0.6456542290345904 | loss: 0.045, 0.575


  7%|▋         | 2000/30000 [05:15<1:37:08,  4.80episodes/s]

[Episode: 2000] - Steps: 94.77 | Rewards: 94.77 | Epsilon: 0.6309573444801263 | loss: -0.074, 0.666


  7%|▋         | 2101/30000 [05:37<1:17:30,  6.00episodes/s]

[Episode: 2100] - Steps: 99.6 | Rewards: 99.6 | Epsilon: 0.6165950018614137 | loss: -0.291, 1.528


  7%|▋         | 2200/30000 [05:57<1:48:27,  4.27episodes/s]

[Episode: 2200] - Steps: 96.47 | Rewards: 96.47 | Epsilon: 0.602559586074288 | loss: -0.111, 1.060


  8%|▊         | 2301/30000 [06:20<1:28:49,  5.20episodes/s]

[Episode: 2300] - Steps: 106.19 | Rewards: 106.19 | Epsilon: 0.5888436553555179 | loss: 0.075, 2.601


  8%|▊         | 2400/30000 [06:45<2:28:01,  3.11episodes/s]

[Episode: 2400] - Steps: 114.62 | Rewards: 114.62 | Epsilon: 0.5754399373370849 | loss: 0.073, 1.597


  8%|▊         | 2500/30000 [07:11<3:06:58,  2.45episodes/s]

[Episode: 2500] - Steps: 125.59 | Rewards: 125.59 | Epsilon: 0.5623413251902758 | loss: 0.459, 4.124


  9%|▊         | 2600/30000 [07:37<1:58:12,  3.86episodes/s]

[Episode: 2600] - Steps: 121.81 | Rewards: 121.81 | Epsilon: 0.5495408738575507 | loss: -0.161, 1.641


  9%|▉         | 2700/30000 [08:04<2:08:15,  3.55episodes/s]

[Episode: 2700] - Steps: 127.34 | Rewards: 127.34 | Epsilon: 0.5370317963701782 | loss: -0.087, 0.609


  9%|▉         | 2801/30000 [08:34<2:18:37,  3.27episodes/s]

[Episode: 2800] - Steps: 137.63 | Rewards: 137.63 | Epsilon: 0.5248074602496976 | loss: 0.256, 8.577


 10%|▉         | 2900/30000 [09:04<2:45:23,  2.73episodes/s]

[Episode: 2900] - Steps: 139.87 | Rewards: 139.87 | Epsilon: 0.5128613839912889 | loss: 0.346, 3.794


 10%|█         | 3000/30000 [09:33<2:52:44,  2.61episodes/s]

[Episode: 3000] - Steps: 135.48 | Rewards: 135.48 | Epsilon: 0.5011872336271957 | loss: 0.498, 4.190


 10%|█         | 3100/30000 [10:08<2:50:35,  2.63episodes/s]

[Episode: 3100] - Steps: 162.11 | Rewards: 162.11 | Epsilon: 0.48977881936836887 | loss: -0.198, 14.168


 11%|█         | 3200/30000 [10:52<3:04:53,  2.42episodes/s]

[Episode: 3200] - Steps: 210.87 | Rewards: 210.87 | Epsilon: 0.47863009232256 | loss: 0.114, 5.093


 11%|█         | 3300/30000 [11:39<2:58:07,  2.50episodes/s]

[Episode: 3300] - Steps: 221.7 | Rewards: 221.7 | Epsilon: 0.467735141287119 | loss: 0.006, 1.525


 11%|█▏        | 3400/30000 [12:20<2:54:29,  2.54episodes/s]

[Episode: 3400] - Steps: 192.62 | Rewards: 192.62 | Epsilon: 0.4570881896147952 | loss: -0.018, 3.780


 12%|█▏        | 3500/30000 [12:59<3:13:47,  2.28episodes/s]

[Episode: 3500] - Steps: 185.17 | Rewards: 185.17 | Epsilon: 0.4466835921508827 | loss: 0.001, 0.388


 12%|█▏        | 3600/30000 [13:38<2:44:42,  2.67episodes/s]

[Episode: 3600] - Steps: 184.04 | Rewards: 184.04 | Epsilon: 0.4365158322400849 | loss: -0.143, 2.684


 12%|█▏        | 3700/30000 [14:24<2:45:27,  2.65episodes/s]

[Episode: 3700] - Steps: 218.13 | Rewards: 218.13 | Epsilon: 0.42657951880151124 | loss: -0.126, 0.213


 13%|█▎        | 3800/30000 [15:10<3:20:33,  2.18episodes/s]

[Episode: 3800] - Steps: 217.42 | Rewards: 217.42 | Epsilon: 0.41686938347025354 | loss: -0.136, 1.672


 13%|█▎        | 3900/30000 [15:54<2:29:10,  2.92episodes/s]

[Episode: 3900] - Steps: 212.28 | Rewards: 212.28 | Epsilon: 0.4073802778040305 | loss: -0.202, 0.787


 13%|█▎        | 4000/30000 [16:42<3:25:42,  2.11episodes/s]

[Episode: 4000] - Steps: 225.78 | Rewards: 225.78 | Epsilon: 0.3981071705534148 | loss: -0.107, 0.600


 14%|█▎        | 4100/30000 [17:32<4:09:10,  1.73episodes/s]

[Episode: 4100] - Steps: 233.37 | Rewards: 233.37 | Epsilon: 0.38904514499419784 | loss: 0.019, 1.335


 14%|█▍        | 4200/30000 [18:20<3:21:11,  2.14episodes/s]

[Episode: 4200] - Steps: 230.97 | Rewards: 230.97 | Epsilon: 0.38018939632047805 | loss: -0.030, 0.492


 14%|█▍        | 4300/30000 [19:08<3:44:52,  1.90episodes/s]

[Episode: 4300] - Steps: 223.8 | Rewards: 223.8 | Epsilon: 0.37153522909708914 | loss: 0.010, 0.602


 15%|█▍        | 4400/30000 [19:59<4:21:07,  1.63episodes/s]

[Episode: 4400] - Steps: 242.39 | Rewards: 242.39 | Epsilon: 0.3630780547700179 | loss: -0.143, 1.112


 15%|█▌        | 4500/30000 [20:51<4:44:12,  1.50episodes/s]

[Episode: 4500] - Steps: 236.23 | Rewards: 236.23 | Epsilon: 0.35481338923349187 | loss: -0.114, 0.441


 15%|█▌        | 4600/30000 [21:44<2:52:14,  2.46episodes/s]

[Episode: 4600] - Steps: 236.76 | Rewards: 236.76 | Epsilon: 0.34673685045244834 | loss: -0.092, 1.537


 16%|█▌        | 4700/30000 [22:32<4:41:21,  1.50episodes/s]

[Episode: 4700] - Steps: 227.56 | Rewards: 227.56 | Epsilon: 0.33884415613911933 | loss: -0.121, 0.969


 16%|█▌        | 4800/30000 [23:21<3:13:10,  2.17episodes/s]

[Episode: 4800] - Steps: 231.53 | Rewards: 231.53 | Epsilon: 0.33113112148250806 | loss: -0.006, 2.322


 16%|█▋        | 4900/30000 [24:11<3:47:00,  1.84episodes/s]

[Episode: 4900] - Steps: 226.79 | Rewards: 226.79 | Epsilon: 0.3235936569295455 | loss: -0.171, 0.222


 17%|█▋        | 5000/30000 [25:04<2:58:04,  2.34episodes/s]

[Episode: 5000] - Steps: 244.15 | Rewards: 244.15 | Epsilon: 0.3162277660167557 | loss: -0.309, 0.718


 17%|█▋        | 5100/30000 [25:57<4:01:44,  1.72episodes/s]

[Episode: 5100] - Steps: 247.52 | Rewards: 247.52 | Epsilon: 0.3090295432512767 | loss: -0.045, 0.595


 17%|█▋        | 5200/30000 [26:52<3:01:53,  2.27episodes/s]

[Episode: 5200] - Steps: 255.7 | Rewards: 255.7 | Epsilon: 0.3019951720401198 | loss: -0.153, 0.458


 18%|█▊        | 5300/30000 [27:42<3:52:30,  1.77episodes/s]

[Episode: 5300] - Steps: 235.26 | Rewards: 235.26 | Epsilon: 0.2951209226665569 | loss: 0.010, 2.028


 18%|█▊        | 5400/30000 [28:33<3:20:58,  2.04episodes/s]

[Episode: 5400] - Steps: 235.73 | Rewards: 235.73 | Epsilon: 0.28840315031257935 | loss: -0.059, 0.371


 18%|█▊        | 5500/30000 [29:24<3:02:31,  2.24episodes/s]

[Episode: 5500] - Steps: 238.0 | Rewards: 238.0 | Epsilon: 0.2818382931263646 | loss: -0.037, 0.692


 19%|█▊        | 5600/30000 [30:16<3:20:43,  2.03episodes/s]

[Episode: 5600] - Steps: 238.94 | Rewards: 238.94 | Epsilon: 0.27542287033373625 | loss: -0.442, 0.481


 19%|█▉        | 5700/30000 [31:03<2:22:37,  2.84episodes/s]

[Episode: 5700] - Steps: 222.48 | Rewards: 222.48 | Epsilon: 0.26915348039261155 | loss: -0.091, 0.910


 19%|█▉        | 5800/30000 [31:57<3:51:55,  1.74episodes/s]

[Episode: 5800] - Steps: 248.9 | Rewards: 248.9 | Epsilon: 0.26302679918945865 | loss: -0.186, 0.565


 20%|█▉        | 5900/30000 [32:52<4:48:10,  1.39episodes/s]

[Episode: 5900] - Steps: 256.74 | Rewards: 256.74 | Epsilon: 0.2570395782768071 | loss: -0.022, 0.766


 20%|██        | 6000/30000 [33:47<3:40:47,  1.81episodes/s]

[Episode: 6000] - Steps: 258.23 | Rewards: 258.23 | Epsilon: 0.2511886431508792 | loss: -0.056, 0.778


 20%|██        | 6100/30000 [34:44<5:01:35,  1.32episodes/s]

[Episode: 6100] - Steps: 252.07 | Rewards: 252.07 | Epsilon: 0.24547089156842472 | loss: -0.280, 0.496


 21%|██        | 6200/30000 [35:41<4:26:29,  1.49episodes/s]

[Episode: 6200] - Steps: 256.52 | Rewards: 256.52 | Epsilon: 0.23988329190187127 | loss: -0.035, 0.770


 21%|██        | 6300/30000 [36:36<3:16:22,  2.01episodes/s]

[Episode: 6300] - Steps: 256.92 | Rewards: 256.92 | Epsilon: 0.23442288153191498 | loss: -0.088, 0.128


 21%|██▏       | 6400/30000 [37:31<3:05:40,  2.12episodes/s]

[Episode: 6400] - Steps: 254.48 | Rewards: 254.48 | Epsilon: 0.22908676527670066 | loss: -0.020, 0.550


 22%|██▏       | 6500/30000 [38:22<2:46:59,  2.35episodes/s]

[Episode: 6500] - Steps: 235.59 | Rewards: 235.59 | Epsilon: 0.2238721138567579 | loss: -0.155, 0.828


 22%|██▏       | 6600/30000 [39:20<3:47:38,  1.71episodes/s]

[Episode: 6600] - Steps: 261.19 | Rewards: 261.19 | Epsilon: 0.2187761623948797 | loss: -0.118, 2.135


 22%|██▏       | 6700/30000 [40:13<4:21:15,  1.49episodes/s]

[Episode: 6700] - Steps: 246.47 | Rewards: 246.47 | Epsilon: 0.21379620895014828 | loss: -0.082, 2.117


 23%|██▎       | 6800/30000 [41:09<4:06:13,  1.57episodes/s]

[Episode: 6800] - Steps: 259.64 | Rewards: 259.64 | Epsilon: 0.20892961308532967 | loss: -0.030, 0.542


 23%|██▎       | 6900/30000 [42:01<3:27:10,  1.86episodes/s]

[Episode: 6900] - Steps: 248.17 | Rewards: 248.17 | Epsilon: 0.20417379446687922 | loss: -0.050, 0.738


 23%|██▎       | 7000/30000 [42:55<3:31:14,  1.81episodes/s]

[Episode: 7000] - Steps: 254.73 | Rewards: 254.73 | Epsilon: 0.19952623149681473 | loss: 0.039, 1.513


 24%|██▎       | 7100/30000 [43:50<3:47:41,  1.68episodes/s]

[Episode: 7100] - Steps: 257.78 | Rewards: 257.78 | Epsilon: 0.194984459975732 | loss: -0.041, 0.744


 24%|██▍       | 7200/30000 [44:46<4:16:40,  1.48episodes/s]

[Episode: 7200] - Steps: 264.05 | Rewards: 264.05 | Epsilon: 0.19054607179625285 | loss: -0.023, 0.451


 24%|██▍       | 7300/30000 [45:41<2:53:10,  2.18episodes/s]

[Episode: 7300] - Steps: 255.16 | Rewards: 255.16 | Epsilon: 0.1862087136662154 | loss: -0.095, 0.168


 25%|██▍       | 7400/30000 [46:41<3:37:19,  1.73episodes/s]

[Episode: 7400] - Steps: 278.28 | Rewards: 278.28 | Epsilon: 0.18197008586092747 | loss: -0.287, 0.359


 25%|██▌       | 7500/30000 [47:39<3:56:13,  1.59episodes/s]

[Episode: 7500] - Steps: 267.2 | Rewards: 267.2 | Epsilon: 0.17782794100382204 | loss: -0.048, 0.420


 25%|██▌       | 7591/30000 [48:34<3:41:53,  1.68episodes/s]