In [1]:
import gym
import numpy as np
import random
from collections import Counter, deque
from tensorboardX import SummaryWriter

import torch
import torch.nn as nn
import torch.nn.utils as nn_utils
import torch.nn.functional as F
import torch.optim as optim

  for external in metadata.entry_points().get(self.group, []):


In [2]:
# Hyperparameters
batch_size = 32

gamma = 0.99

eps_start=1.0
eps_decay = 0.995
eps_min = 0.1      # Minimal exploration rate (epsilon-greedy)

num_rounds = 500
num_episodes = 50
learning_limit = 100
replay_limit = 1000  # Number of steps until starting replay
weight_update = 1000 # Number of steps until updating the target weights


In [3]:
#create model
class Model(nn.Module):
    def __init__(self, input_shape, n_actions):
        super(Model, self).__init__()

        self.net = nn.Sequential(
            nn.Linear(input_shape[0]*input_shape[1], 64),
            nn.ReLU(),
            nn.Linear(64, 128),
            nn.ReLU(),
            nn.Linear(128, n_actions)
            
        )

    def forward(self, x):
        # flatten the observation space Box to linear tensor
        tensor_array = torch.from_numpy(state)
        x_flat = torch.flatten(tensor_array).to(torch.float32)
        return self.net(x_flat)

In [4]:
class SchedulerEnv(gym.Env):

    def __init__(self):
        
        #starting parameters
        num_gps = 100
        num_slots = 32
        num_pre_booked = 750
        to_book = [2,1,2,2,1,1,1,3,3,1,2,1,3,2,1,1,2,1,3,2,3,2]
        num_to_book = len(to_book)
        agent_pos = [0,0]
        reward_decay = 0.95
        
        #set parameters for the day
        self.num_gps = num_gps
        self.num_slots = num_slots
        self.num_pre_booked = num_pre_booked
        self.to_book = to_book
        self.num_to_book = num_to_book
        self.diary_slots = num_gps*num_slots
        self.agent_pos = agent_pos
        self.reward_decay = reward_decay

        #set action space to move around the grid
        self.action_space = gym.spaces.Discrete(4) #up, down, left, right
        
        #set observation space 
        self.observation_space = gym.spaces.Box(low=0, high=1, shape=(self.num_slots, self.num_gps), dtype=np.int32)
   
    #creates daily diary for each gp, randomly populates prebooked appointments and resets parameters
    def reset(self):

        #creates zero filled dataframe with row per time slot and column per gp
        self.state = np.zeros((self.num_slots, self.num_gps),dtype=float)

        #randomly enters a 1 for each pre booked appointments
        pre_booked = self.num_pre_booked
        while pre_booked>0:
            pre_booked -= 1
            self.state[np.random.randint(self.num_slots), np.random.randint(self.num_gps)] = 1
            
        #randomly sets the agent start space
        self.agent_pos = [np.random.randint(self.num_slots), np.random.randint(self.num_gps)]

        #resets parameters for new episode
        self.done = False
        self.reward = 0
        self.appt_idx = 0
        self.decay_steps = 1
        
        #print('starting state', self.state.sum(), self.state)

        return self.state
    
    #calculates new position of the agent based on the action
    def move_agent(self, action):

        #set boundaries for the grid
        max_row = self.num_slots - 1
        max_col = self.num_gps - 1

        #setting new co-ordinates for the agent
        new_row = self.agent_pos[0]
        new_col = self.agent_pos[1]

        #calculate what the new position may be based on the action without going out the grid
        if action == 0:
            #print('up')
            new_row = max(self.agent_pos[0] - 1, 0)
        if action == 1:
            #print('down')
            new_row = min(self.agent_pos[0] + 1, max_row)
        if action == 2:
            #print('left')
            new_col = max(self.agent_pos[1] - 1, 0)
        if action == 3:
            #print('right')
            new_col = min(self.agent_pos[1] + 1, max_col)

        new_pos = [new_row, new_col]
        #print('new pos', new_pos)

        return new_pos

    #checks if we can look to book appointment starting here
    def check_bookable(self):
        return self.state[self.agent_pos[0], self.agent_pos[1]] == 0.0
    
    #action if we can't book the appointment
    def invalid_booking(self):
        #print('cant book')
        self.decay_steps += 1
        self.reward = -1
        
    #action if we can book the appointment
    def valid_booking(self):
        #print('go ahead and book')
        self.appt_idx += 1
        self.decay_steps = 1
        self.reward = 1
    
    #checks if the appointment fits
    def check_and_book(self):
        
        max_row = self.num_slots - 1
        cells_to_check = self.to_book[self.appt_idx]
        
        if cells_to_check==1:
            #print('good to check for single')
            if self.state[self.agent_pos[0], self.agent_pos[1]] == 0:
                self.state[self.agent_pos[0], self.agent_pos[1]] = 1
                self.valid_booking()
            else:
                #print('single taken')
                self.invalid_booking()

        if cells_to_check==2:
            #check we're not at the bottom of the grid
            if self.agent_pos[0]<max_row:
                #check the next cells is also 0.0
                #print('good to check for double')
                if self.state[self.agent_pos[0], self.agent_pos[1]] == 0 and \
                self.state[(self.agent_pos[0]+1), self.agent_pos[1]] == 0:
                    self.state[self.agent_pos[0], self.agent_pos[1]] = 1
                    self.state[(self.agent_pos[0]+1), self.agent_pos[1]] = 1
                    self.valid_booking()
                    self.agent_pos = [(self.agent_pos[0]+1), self.agent_pos[1]]
                    #print('after booking', self.agent_pos)
                else:
                    #print('double taken')
                    self.invalid_booking()
            else:
                #print('not for double')
                self.invalid_booking()
                
        if cells_to_check==3:
            #check we're not at the bottom of the grid
            if self.agent_pos[0]+1<max_row:
                #print('good to check for treble')
                if self.state[self.agent_pos[0], self.agent_pos[1]] == 0 and \
                self.state[(self.agent_pos[0]+1), self.agent_pos[1]] == 0 \
                 and self.state[(self.agent_pos[0]+2), self.agent_pos[1]] == 0:
                    self.state[self.agent_pos[0], self.agent_pos[1]] = 1
                    self.state[(self.agent_pos[0]+1), self.agent_pos[1]] = 1
                    self.state[(self.agent_pos[0]+2), self.agent_pos[1]] = 1
                    self.valid_booking()
                    self.agent_pos = [(self.agent_pos[0]+2), self.agent_pos[1]]
                else:
                    #print('treble taken')
                    self.invalid_booking()
            else:
                #print('not for treble')
                self.invalid_booking()
                
        if cells_to_check==4:
            #check we're not at the bottom of the grid
            if self.agent_pos[0]+2<max_row:
                #check the next cells is also 0.0
                #print('good for quad')
                if self.state[self.agent_pos[0], self.agent_pos[1]] == 0 and \
                self.state[(self.agent_pos[0]+1), self.agent_pos[1]] == 0 \
                 and self.state[(self.agent_pos[0]+2), self.agent_pos[1]] == 0 and \
                self.state[(self.agent_pos[0]+3), self.agent_pos[1]] == 0:
                    self.state[self.agent_pos[0], self.agent_pos[1]] = 1
                    self.state[(self.agent_pos[0]+1), self.agent_pos[1]] = 1
                    self.state[(self.agent_pos[0]+2), self.agent_pos[1]] = 1
                    self.state[(self.agent_pos[0]+3), self.agent_pos[1]] = 1
                    self.valid_booking()
                    self.agent_pos = [(self.agent_pos[0]+3), self.agent_pos[1]]
                else:
                    #print('quad taken')
                    self.invalid_booking()
            else:
                #print('not for quad')
                self.invalid_booking()

        next_state = self.state

        return next_state

    def step(self, action):

        #get new position of agent based on action
        new_agent_pos = self.move_agent(action)
        #print('new and old pos', new_agent_pos, self.agent_pos)
        
        #if the agent is stuck on an edge then move to a new position
        if new_agent_pos == self.agent_pos:
            self.agent_pos = [np.random.randint(self.num_slots), np.random.randint(self.num_gps)]
            #print('here1', self.agent_pos)
        else:
            self.agent_pos = new_agent_pos
            #print('here2', self.agent_pos)
        
        #print('trying to book', self.to_book, self.appt_idx)
        
        #check if it's possible to book then book
        if self.check_bookable():
            #print('checked here')
            self.state = self.check_and_book()
        else:
            #print('not bookable')
            self.invalid_booking()
        
        #work out if episode complete
        if self.appt_idx == len(self.to_book):
            #print('all booked')
            self.done = True
  
        #work out rewards
        #self.reward = (1 - (self.reward_decay**self.decay_steps))
        
        #print('step', self.decay_steps, self.reward)
        #print('end step')

        info = {}
        return self.state, self.reward, self.done, info

In [None]:
#device = "cuda"
device = "cpu"

env = SchedulerEnv()

#start writing to tensorboard
writer = SummaryWriter(comment="Scheduler DQN")

#create the current network and target network
policy_model = Model(env.observation_space.shape, env.action_space.n).to(device)

target_model = Model(env.observation_space.shape, env.action_space.n).to(device)
target_model.load_state_dict(policy_model.state_dict())

optimizer = optim.Adam(policy_model.parameters(), lr=0.001, eps=1e-3)

          # Exploration rate    
replay_buffer = deque(maxlen=1000)

step_idx = 0
epsilon = eps_start

for i in range(num_rounds):
    #change this for while not true once it works
    episode_reward = 0

    for j in range(num_episodes):
        
        step_idx += 1
        #print(i,j,step_idx)
        state = env.reset()

        #epsilon for epsilon greedy strategy  
        if epsilon > eps_min:
            epsilon *= eps_decay
            
        #print('epsilon', epsilon)
            
        check = policy_model(state)
        #print(check)

        # Select and perform an action
        if step_idx > learning_limit:
            if np.random.rand() > epsilon:
                action = torch.argmax(policy_model(state))
        else:
            action = np.random.randint(env.action_space.n)

        next_state, reward, done, _ = env.step(action)
        reward = torch.tensor([reward], device=device)
        episode_reward += reward
        #print('here rewards', episode_reward, reward, step_idx)

        # Store other info in replay buffer
        replay_buffer.append((state, action, reward, next_state, done))
        #print('len buffer', len(replay_buffer))

        # Move to the next state
        state = next_state

        if done:
            break
            
        #print('stopped episode', j)
        
    writer.add_scalar('episode_reward', episode_reward, step_idx)

    #once we're ready to learn then start learning with mini batches
    if len(replay_buffer) == replay_limit:
        print('replay buffer')
        minibatch = random.sample(replay_buffer, batch_size)

        for state, action, reward, next_state, done in minibatch:    
            optimizer.zero_grad()
            #pass state to policy to get qval from policy
            pred_qval = policy_model(state)

            #pass next state to target policy to get next set of qvals (future gains)
            if not done:
                next_qval = (reward + (gamma * max(target_model(next_state)))).detach()
                #next_qval = next_qval.detach()
            else:
                next_qval = reward    

            loss = F.mse_loss(pred_qval, next_qval)
            print('loss', loss)
            loss.backward()

            optimizer.step()
    print('step', step_idx)

    # Update the target network, copying all weights and biases in DQN
    # Periodically update the target network by Q network to target Q network
    if step_idx % weight_update == 0:
        print('update weights')
        # Update weights of target
        target_model.load_state_dict(policy_model.state_dict())

writer.close()

len buffer 1
len buffer 2
len buffer 3
len buffer 4
len buffer 5
len buffer 6
len buffer 7
len buffer 8
len buffer 9
len buffer 10
len buffer 11
len buffer 12
len buffer 13
len buffer 14
len buffer 15
len buffer 16
len buffer 17
len buffer 18
len buffer 19
len buffer 20
len buffer 21
len buffer 22
len buffer 23
len buffer 24
len buffer 25
len buffer 26
len buffer 27
len buffer 28
len buffer 29
len buffer 30
len buffer 31
len buffer 32
len buffer 33
len buffer 34
len buffer 35
len buffer 36
len buffer 37
len buffer 38
len buffer 39
len buffer 40
len buffer 41
len buffer 42
len buffer 43
len buffer 44
len buffer 45
len buffer 46
len buffer 47
len buffer 48
len buffer 49
len buffer 50
step 50
len buffer 51
len buffer 52
len buffer 53
len buffer 54
len buffer 55
len buffer 56
len buffer 57
len buffer 58
len buffer 59
len buffer 60
len buffer 61
len buffer 62
len buffer 63
len buffer 64
len buffer 65
len buffer 66
len buffer 67
len buffer 68
len buffer 69
len buffer 70
len buffer 71
len buf

len buffer 554
len buffer 555
len buffer 556
len buffer 557
len buffer 558
len buffer 559
len buffer 560
len buffer 561
len buffer 562
len buffer 563
len buffer 564
len buffer 565
len buffer 566
len buffer 567
len buffer 568
len buffer 569
len buffer 570
len buffer 571
len buffer 572
len buffer 573
len buffer 574
len buffer 575
len buffer 576
len buffer 577
len buffer 578
len buffer 579
len buffer 580
len buffer 581
len buffer 582
len buffer 583
len buffer 584
len buffer 585
len buffer 586
len buffer 587
len buffer 588
len buffer 589
len buffer 590
len buffer 591
len buffer 592
len buffer 593
len buffer 594
len buffer 595
len buffer 596
len buffer 597
len buffer 598
len buffer 599
len buffer 600
step 600
len buffer 601
len buffer 602
len buffer 603
len buffer 604
len buffer 605
len buffer 606
len buffer 607
len buffer 608
len buffer 609
len buffer 610
len buffer 611
len buffer 612
len buffer 613
len buffer 614
len buffer 615
len buffer 616
len buffer 617
len buffer 618
len buffer 619
l



len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
replay buffer
loss tensor(1.0786, grad_fn=<MseLossBackward>)
loss tensor(0.9298, grad_fn=<MseLossBackward>)
loss tensor(1.0779, grad_fn=<MseLossBackward>)
loss tensor(1.0860, grad_fn=<MseLossBackward>)
loss tensor(1.1322, grad_fn=<MseLossBackward>)
loss tensor(1.0187, grad_fn=<MseLossBackward>)


len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
replay buffer
loss tensor(0.5293, grad_fn=<MseLossBackward>)
loss tensor(1.6880, grad_fn=<MseLossBackward>)
loss tensor(0.3908, grad_fn=<MseLossBackward>)
loss tensor(0.4981, grad_fn=<MseLossBackward>)
loss tensor(0.4577, grad_fn=<MseLossBackward>)
loss tensor(0.4865, grad_fn=<MseLossBackward>)
loss tensor(1.89

loss tensor(0.6025, grad_fn=<MseLossBackward>)
loss tensor(1.5119, grad_fn=<MseLossBackward>)
loss tensor(0.6500, grad_fn=<MseLossBackward>)
loss tensor(0.5688, grad_fn=<MseLossBackward>)
loss tensor(0.6583, grad_fn=<MseLossBackward>)
loss tensor(1.3647, grad_fn=<MseLossBackward>)
loss tensor(1.3495, grad_fn=<MseLossBackward>)
loss tensor(0.6043, grad_fn=<MseLossBackward>)
loss tensor(0.6526, grad_fn=<MseLossBackward>)
loss tensor(0.6213, grad_fn=<MseLossBackward>)
loss tensor(0.6150, grad_fn=<MseLossBackward>)
loss tensor(0.5543, grad_fn=<MseLossBackward>)
loss tensor(0.6115, grad_fn=<MseLossBackward>)
loss tensor(0.4478, grad_fn=<MseLossBackward>)
loss tensor(0.3931, grad_fn=<MseLossBackward>)
loss tensor(0.5030, grad_fn=<MseLossBackward>)
loss tensor(0.5538, grad_fn=<MseLossBackward>)
loss tensor(0.5223, grad_fn=<MseLossBackward>)
loss tensor(0.3965, grad_fn=<MseLossBackward>)
step 1400
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
l

len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
replay buffer
loss tensor(1.4275, grad_fn=<MseLossBackward>)
loss tensor(0.5922, grad_fn=<MseLossBackward>)
loss tensor(1.3455, grad_fn=<MseLossBackward>)
loss tensor(0.7348, grad_fn=<MseLossBackward>)
loss tensor(0.7183, grad_fn=<MseLossBackward>)
loss tensor(0.8330, grad_fn=<MseLossBackward>)
loss tensor(1.3865, grad_fn=<MseLossBackward>)
loss tensor(1.0635, grad_fn=<MseLossBackward>)
loss tensor(1.3122, grad_fn=<MseLossBackward>)
loss tensor(0.6924, grad_fn=<MseLossBackward>)
loss tensor(0.7070, grad_fn=<MseLossBackward>)
loss tensor(0.6758, grad_fn=<MseLossBackward>)
loss tensor(0.7142, grad_fn=<MseLossBackward>)
loss tensor(0.6984, grad_fn=<MseLossBackward>)
loss tensor(0.6107, grad_fn=<MseLossBackward>)
loss tensor(0.5205, grad_fn=<MseLossBackw

len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
replay buffer
loss tensor(0.8776, grad_fn=<MseLossBackward>)
loss tensor(1.0979, grad_fn=<MseLossBackward>)
loss tensor(0.9321, grad_fn=<MseLossBackward>)
loss tensor(0.8728, grad_fn=<MseLossBackward>)
loss tensor(0.9659, grad_fn=<MseLossBackward>)
loss tensor(0.6321, grad_fn=<MseLossBackward>)
loss tensor(1.2197, grad_fn=<MseLossBackward>)
loss tensor(0.8630, grad_fn=<MseL

loss tensor(0.5955, grad_fn=<MseLossBackward>)
loss tensor(1.2789, grad_fn=<MseLossBackward>)
loss tensor(0.5960, grad_fn=<MseLossBackward>)
loss tensor(1.8307, grad_fn=<MseLossBackward>)
loss tensor(1.5133, grad_fn=<MseLossBackward>)
loss tensor(0.4012, grad_fn=<MseLossBackward>)
loss tensor(1.6988, grad_fn=<MseLossBackward>)
loss tensor(0.3949, grad_fn=<MseLossBackward>)
step 1950
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len bu

loss tensor(1.0946, grad_fn=<MseLossBackward>)
loss tensor(1.3877, grad_fn=<MseLossBackward>)
loss tensor(0.6462, grad_fn=<MseLossBackward>)
loss tensor(0.6820, grad_fn=<MseLossBackward>)
loss tensor(0.9869, grad_fn=<MseLossBackward>)
loss tensor(1.2901, grad_fn=<MseLossBackward>)
loss tensor(0.9705, grad_fn=<MseLossBackward>)
loss tensor(0.7659, grad_fn=<MseLossBackward>)
loss tensor(0.5616, grad_fn=<MseLossBackward>)
step 2150
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buf

len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
replay buffer
loss tensor(1.3712, grad_fn=<MseLossBackward>)
loss tensor(1.1595, grad_fn=<MseLossBackward>)
loss tensor(0.6770, grad_fn=<MseLossBackward>)
loss tensor(1.2319, grad_fn=<MseLossBackward>)
loss tensor(0.6761, grad_fn=<MseLossBackward>)
loss tensor(1.1414, grad_fn=<MseLossBackward>)


len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
replay buffer
loss tensor(0.7178, grad_fn=<MseLossBackward>)
loss tensor(0.6433, grad_fn=<MseLossBackward>)
loss tensor(0.8961, grad_fn=<MseLossBackward>)
loss tensor(0.6425, grad_fn=<MseLossBackward>

loss tensor(0.8852, grad_fn=<MseLossBackward>)
loss tensor(0.8319, grad_fn=<MseLossBackward>)
loss tensor(0.8829, grad_fn=<MseLossBackward>)
loss tensor(1.1233, grad_fn=<MseLossBackward>)
loss tensor(0.6096, grad_fn=<MseLossBackward>)
loss tensor(1.7874, grad_fn=<MseLossBackward>)
step 2750
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len 

len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
replay buffer
loss tensor(1.8229, grad_fn=<MseLossBackward>)
loss tensor(0.2151, grad_fn=<MseLossBackward>)
loss tensor(1.8426, grad_fn=<MseLossBackward>)
loss tensor(0.4877, grad_fn=<MseLossBackward>)
loss tensor(0.8461, grad_fn=<MseLossBackward>)
loss tensor(0.4716, grad_fn=<Ms

loss tensor(1.1539, grad_fn=<MseLossBackward>)
loss tensor(0.8552, grad_fn=<MseLossBackward>)
loss tensor(0.7260, grad_fn=<MseLossBackward>)
loss tensor(0.7990, grad_fn=<MseLossBackward>)
step 3150
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
re

len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
replay buffer
loss tensor(0.0010, grad_fn=<MseLossBackward>)
loss tensor(2.7533, grad_fn=<MseLossBackward>)
loss tensor(2.2772, grad_fn=<MseLossBackward>)
loss tensor(0.0211, grad_fn=<MseLossBackward>)
loss tensor(0.0582, grad_fn=<MseLossBackward>)
loss tensor(2.7093, grad_fn=<MseLossBackward>)


len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
replay buffer
loss tensor(0.1663, grad_fn=<MseLossBackward>)
loss tensor(2.1896, grad_fn=<MseLossBackward>)
loss tensor(0.2291, grad_fn=<MseLossBackward>)
loss tensor(1.2662, grad_fn=<MseLossBackward>)
loss tensor(0.

len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
replay buffer
loss tensor(0.5684, grad_fn=<MseLossBackward>)
loss tensor(0.3401, grad_fn=<MseLossBackward>)
loss tensor(2.3171, grad_fn=<MseLossBackward>)
loss tensor(1.8057, grad_fn=<MseLossBackward>)
loss tensor(0.2883, grad_fn=<MseLossBackward>)
loss tensor(0.0185, grad_fn=<MseLossBackward>)
loss tensor(0.0407, grad_fn=<Mse

len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
replay buffer
loss tensor(0.8053, grad_fn=<MseLossBackward>)
loss tensor(0.8448, grad_fn=<MseLossBackward>)
loss tensor(1.2959, grad_fn=<MseLossBackward>)
loss tensor(0.1530, grad_fn=<MseLossBackward>)
loss tensor(2.2621, grad_fn=<MseLossBackward>)
loss tensor(0.2928, grad_fn=<MseLossBackward>)
loss tensor(0.3070, grad_fn=<Mse

loss tensor(1.0164, grad_fn=<MseLossBackward>)
loss tensor(0.9369, grad_fn=<MseLossBackward>)
loss tensor(1.0313, grad_fn=<MseLossBackward>)
loss tensor(1.2847, grad_fn=<MseLossBackward>)
loss tensor(0.8378, grad_fn=<MseLossBackward>)
loss tensor(1.2290, grad_fn=<MseLossBackward>)
loss tensor(1.2067, grad_fn=<MseLossBackward>)
loss tensor(1.2903, grad_fn=<MseLossBackward>)
step 4150
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len bu

len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
replay buffer
loss tensor(1.8527, grad_fn=<MseLossBackward>)
loss tensor(0.9442, grad_fn=<MseLossBackward>)
loss tensor(0.9700, grad_fn=<MseLossBackward>)
loss tensor(0.6493, grad_fn=<MseLossBackward>)
loss tensor(1.0752, grad_fn=<MseLossBackward>)
loss tensor(1.1

len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
replay buffer
loss tensor(1.4643, grad_fn=<MseLossBackward>)
loss tensor(1.7464, grad_fn=<MseLossBackward>)
loss tensor(0.4945, grad_fn=<MseLossBackward>)
loss tensor(1.3137, grad_fn=<MseLossBackward>

len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
replay buffer
loss tensor(1.3245, grad_fn=<MseLossBackward>)
loss tensor(0.8241, grad_fn=<MseLossBackward>)
loss tensor(0.5859, grad_fn=<MseLossBackward>)
loss tensor(0.7601, grad_fn=<MseLossBackward>)
loss tensor(1.1005, grad_fn=<MseLossBackward>)
loss tensor(0.3

len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
replay buffer
loss tensor(0.8980, grad_fn=<MseLossBackward>)
loss tensor(1.4315, grad_fn=<MseLossBackward>)
loss tensor(0.6663, grad_fn=<MseLossBackward>)
loss tensor(0.8861, grad_fn=<MseLossBackward>)
loss tensor(1.

loss tensor(1.1715, grad_fn=<MseLossBackward>)
loss tensor(0.9095, grad_fn=<MseLossBackward>)
loss tensor(1.0663, grad_fn=<MseLossBackward>)
step 5150
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
replay buffer
loss tensor(0.8907, grad_fn=<MseLos

len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
replay buffer
loss tensor(1.1244, grad_fn=<MseLossBackward>)
loss tensor(0.8826, grad_fn=<MseLossBackward>)
loss tensor(0.8033, grad_fn=<MseLossBackward>)
loss tensor(0.5561, grad_fn=<MseLossBackward>)
loss tensor(1.5166, grad_fn=<MseLossBackward>)
loss tensor(0.5788, grad_fn=<Ms

step 5550
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
replay buffer
loss tensor(0.7243, grad_fn=<MseLossBackward>)
loss tensor(0.8075, grad_fn=<MseLossBackward>)
loss tensor(0.5658, grad_fn=<MseLossBackward>)
loss tensor(0.4655, grad_fn=<MseLos

loss tensor(2.0978, grad_fn=<MseLossBackward>)
loss tensor(0.2820, grad_fn=<MseLossBackward>)
loss tensor(0.2399, grad_fn=<MseLossBackward>)
loss tensor(0.4196, grad_fn=<MseLossBackward>)
loss tensor(0.1592, grad_fn=<MseLossBackward>)
loss tensor(0.1673, grad_fn=<MseLossBackward>)
step 5750
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len 

loss tensor(0.1992, grad_fn=<MseLossBackward>)
loss tensor(0.1436, grad_fn=<MseLossBackward>)
loss tensor(0.6929, grad_fn=<MseLossBackward>)
loss tensor(0.2736, grad_fn=<MseLossBackward>)
loss tensor(0.1571, grad_fn=<MseLossBackward>)
loss tensor(0.2286, grad_fn=<MseLossBackward>)
loss tensor(2.3073, grad_fn=<MseLossBackward>)
loss tensor(0.1295, grad_fn=<MseLossBackward>)
loss tensor(2.0924, grad_fn=<MseLossBackward>)
loss tensor(0.4206, grad_fn=<MseLossBackward>)
loss tensor(2.4618, grad_fn=<MseLossBackward>)
loss tensor(2.3559, grad_fn=<MseLossBackward>)
loss tensor(0.1740, grad_fn=<MseLossBackward>)
loss tensor(0.5643, grad_fn=<MseLossBackward>)
loss tensor(2.1804, grad_fn=<MseLossBackward>)
loss tensor(2.6799, grad_fn=<MseLossBackward>)
loss tensor(0.1517, grad_fn=<MseLossBackward>)
loss tensor(1.8412, grad_fn=<MseLossBackward>)
loss tensor(1.3630, grad_fn=<MseLossBackward>)
loss tensor(0.3119, grad_fn=<MseLossBackward>)
loss tensor(0.4429, grad_fn=<MseLossBackward>)
step 5950
len

len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
replay buffer
loss tensor(0.6349, grad_fn=<MseLossBackward>)
loss tensor(0.6559, grad_fn=<MseLossBackward>)
loss tensor(1.4206, grad_fn=<MseLossBackward>)
loss tensor(0.8926, grad_fn=<MseLossBackward>)
loss tensor(1.3018, grad_fn=<MseLossBackward>)
loss tensor(0.8608, grad_fn=<MseLossBackward>)
loss tensor(0.8265, grad_fn=<MseLossBackward>)
loss tensor(0.9100, grad_fn=<MseLossBackward>)
loss tensor(1.2200, grad_fn=<MseLossBackward>)
loss tensor(0.8757, grad_fn=<MseLossBackward>)
loss tensor(0.7681, grad_fn=<MseLossBackward>)
loss tensor(0.6664, grad_fn=<MseLossBackward>)
loss tensor(0.6064, grad_fn=<MseLossBa

len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
replay buffer
loss tensor(0.2443, grad_fn=<MseLossBackward>)
loss tensor(0.2543, grad_fn=<MseLossBackward>)
loss tensor(1.9517, grad_fn=<MseLossBackward>)
loss tensor(2.0217, grad_fn=<MseLossBackward>)
loss tensor(2.4669, grad_fn=<MseLossBackward>)
loss tensor(0.4961, grad_fn=<MseLossBackward>)
loss tensor(0.00

len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
replay buffer
loss tensor(1.7072, grad_fn=<MseLossBackward>)
loss tensor(0.2506, grad_fn=<MseLossBackward>)
loss tensor(0.1644, grad_fn=<MseLossBackward>)
loss tensor(1.4153, grad_fn=<MseLossBackward>

loss tensor(1.2892, grad_fn=<MseLossBackward>)
loss tensor(0.8856, grad_fn=<MseLossBackward>)
loss tensor(1.2380, grad_fn=<MseLossBackward>)
loss tensor(0.5983, grad_fn=<MseLossBackward>)
loss tensor(1.0314, grad_fn=<MseLossBackward>)
loss tensor(0.6462, grad_fn=<MseLossBackward>)
loss tensor(1.6572, grad_fn=<MseLossBackward>)
loss tensor(0.5647, grad_fn=<MseLossBackward>)
loss tensor(0.3841, grad_fn=<MseLossBackward>)
loss tensor(1.0350, grad_fn=<MseLossBackward>)
loss tensor(0.9869, grad_fn=<MseLossBackward>)
loss tensor(0.7915, grad_fn=<MseLossBackward>)
loss tensor(0.6661, grad_fn=<MseLossBackward>)
step 6700
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 

len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
replay buffer
loss tensor(0.7010, grad_fn=<MseLossBackward>)
loss tensor(0.4670, grad_fn=<MseLossBackward>)
loss tensor(0.8982, grad_fn=<MseLossBackward>)
loss tensor(0.5869, grad_fn=<MseLossBackward>)
loss tensor(1.1919, grad_fn=<MseLossBackward>)
loss tensor(0.5627, grad_fn=<MseLossBackward>)
loss tensor(0.3635, grad_fn=<MseLossBackward>)
loss tensor(0.8226, grad_fn=<MseLossBackward>)
loss tensor(0.8696, grad_fn=<MseLossBackward>)
loss tensor(0.3077, grad_fn=<MseLossBackward>)
loss tensor(1.6240, grad_fn=<MseLossBackward>)
loss tensor(0.7581, grad_fn=<MseLossBackward>)
loss tensor(0.0879, grad_fn=<MseLossBackward>)
loss tensor(0.5000, grad_fn=<MseLossBackward>)
loss tensor(0.2593, grad_fn=<MseLossBackward>)
loss tensor(1.9582, grad_fn=<MseLossBackward>)
loss tensor(0.1931, grad_f

len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
replay buffer
loss tensor(0.2467, grad_fn=<MseLossBackward>)
loss tensor(2.3308, grad_fn=<MseLossBackward>)
loss tensor(0.1265, grad_fn=<MseLossBackward>)
loss tensor(2.5153, grad_fn=<MseLossBackward>

step 7250
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
replay buffer
loss tensor(0.5868, grad_fn=<MseLossBackward>)
loss tensor(0.4727, grad_fn=<MseLossBackward>)
loss tensor(0.4245, grad_fn=<MseLossBackward>)
loss tensor(0.2645, grad_fn=<MseLos

loss tensor(0.3242, grad_fn=<MseLossBackward>)
loss tensor(0.8494, grad_fn=<MseLossBackward>)
step 7450
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
replay buffer
loss tensor(0.0598, grad_fn=<MseLossBackward>)
loss tensor(2.1241, grad_fn=<MseLos

len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
replay buffer
loss tensor(0.3800, grad_fn=<MseLossBackward>)
loss tensor(0.8836, grad_fn=<MseLossBackward>)
loss tensor(0.6459, grad_fn=<MseLossBackward>)
loss tensor(0.2314, grad_fn=<MseLossBackward>)
loss tensor(0.3195, grad_fn=<MseLossBackward>)
loss tensor(0.6201, grad_fn=<MseLossBackward>)
loss tensor(0.3004, grad_fn=<Mse

len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
replay buffer
loss tensor(0.7223, grad_fn=<MseLossBackward>)
loss tensor(1.9839, grad_fn=<MseLossBackward>)
loss tensor(0.2187, grad_fn=<MseLossBackward>)
loss tensor(0.0760, grad_fn=<MseLossBackward>)
loss tensor(0.5194, grad_fn=<MseLossBackward>)
loss tensor(1.6548, grad_fn=<MseLossBackward>)
loss tensor(2.2210, grad_fn=<MseLossBackward>)
l

len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
replay buffer
loss tensor(0.3389, grad_fn=<MseLossBackward>)
loss tensor(0.3011, grad_fn=<MseLossBackward>)
loss tensor(0.2194, grad_fn=<MseLossBackward>)
loss tensor(0.4077, grad_fn=<MseLossBackward>)
loss tensor(0.1253, grad_fn=<MseLossBackward>)
loss tensor(2.3989, grad_fn=<MseLossBackward>)
loss tensor(3.0001, grad_fn=<MseLossBackward>)
loss tensor(3.030

len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
replay buffer
loss tensor(0.5376, grad_fn=<MseLossBackward>)
loss tensor(1.4228, grad_fn=<MseLossBackward>)
loss tensor(1.3063, grad_fn=<MseLossBackward>)
loss tensor(0.7572, grad_fn=<MseLossBackward>)
loss tensor(0.8177, grad_fn=<MseLossBackward>)
loss tensor(1.4261, grad_fn=<MseLossBackward>)
loss tensor(0.9652, grad_fn=<MseLossBackward>)
l

len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
replay buffer
loss tensor(0.2320, grad_fn=<MseLossBackward>)
loss tensor(0.4851, grad_fn=<MseLossBackward>)
loss tensor(0.2953, grad_fn=<MseLossBackward>)
loss tensor(1.4151, grad_fn=<MseLossBackward>)
loss tensor(1.8585, grad_fn=<MseLossBackward>)
loss tensor(1.0058, grad_fn=<Ms

len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
replay buffer
loss tensor(0.7389, grad_fn=<MseLossBackward>)
loss tensor(0.6600, grad_fn=<MseLossBackward>)
loss tensor(0.6679, grad_fn=<MseLossBackward>)
loss tensor(0.8745, grad_fn=<MseLossBackward>

loss tensor(0.4156, grad_fn=<MseLossBackward>)
loss tensor(0.6891, grad_fn=<MseLossBackward>)
loss tensor(0.0724, grad_fn=<MseLossBackward>)
loss tensor(1.2077, grad_fn=<MseLossBackward>)
loss tensor(1.1022, grad_fn=<MseLossBackward>)
loss tensor(0.9465, grad_fn=<MseLossBackward>)
loss tensor(1.1268, grad_fn=<MseLossBackward>)
loss tensor(0.6132, grad_fn=<MseLossBackward>)
loss tensor(0.6787, grad_fn=<MseLossBackward>)
loss tensor(1.0530, grad_fn=<MseLossBackward>)
loss tensor(0.9233, grad_fn=<MseLossBackward>)
loss tensor(1.6426, grad_fn=<MseLossBackward>)
loss tensor(0.5151, grad_fn=<MseLossBackward>)
loss tensor(1.6895, grad_fn=<MseLossBackward>)
loss tensor(0.6643, grad_fn=<MseLossBackward>)
loss tensor(0.3346, grad_fn=<MseLossBackward>)
loss tensor(1.5343, grad_fn=<MseLossBackward>)
loss tensor(1.9292, grad_fn=<MseLossBackward>)
loss tensor(0.7673, grad_fn=<MseLossBackward>)
step 8850
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
l

len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
replay buffer
loss tensor(1.0147, grad_fn=<MseLossBackward>)
loss tensor(0.9632, grad_fn=<MseLossBackward>)
loss tensor(1.0777, grad_fn=<MseLossBackward>)
loss tensor(1.1055, grad_fn=<MseLossBackward>)
loss tensor(0.8635, grad_fn=<MseLossBackward>)
loss tensor(1.1056, grad_fn=<MseLossBackward>)
loss tensor(0.8631, grad_fn=<MseLossBackward>)
loss tensor(0.8515, grad_fn=<MseLossBackward>)
loss tensor(0.8003, grad_fn=<MseLossBackward>)
loss tensor(1.2100, grad_fn=<MseLossBackward>)
loss tensor(1.3074, grad_fn=<MseLossBackward>)
loss tensor(0.8242, grad_fn=<MseLossBackward>)
loss tensor(0.7599, grad_fn=<MseLossBackward>)
loss tensor(1.2527, grad_fn=<MseLossBackward>)
loss tensor(1.2431, grad_fn=<MseLossBackward>)
loss tensor(1.2277, grad_

len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
replay buffer
loss tensor(1.2699, grad_fn=<MseLossBackward>)
loss tensor(0.4419, grad_fn=<MseLossBackward>)
loss tensor(0.6643, grad_fn=<MseLossBackward>)
loss tensor(0.6396, grad_fn=<MseLossBackward>)
loss tensor(1.3091, grad_fn=<MseLossBackward>)
loss tensor(0.7421, grad_fn=<MseLossBackward>)
loss tensor(1.4505, grad_fn=<MseLossBackward>)
l

len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
replay buffer
loss tensor(1.2151, grad_fn=<MseLossBackward>)
loss tensor(0.7507, grad_fn=<MseLossBackward>)
loss tensor(0.2338, grad_fn=<MseLossBackward>)
loss tensor(1.4706, grad_fn=<MseLossBackward>)
loss tensor(0.6922, grad_fn=<MseLossBackward>)

len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
replay buffer
loss tensor(0.2764, grad_fn=<MseLossBackward>)
loss tensor(0.4240, grad_fn=<MseLossBackward>)
loss tensor(2.4785, grad_fn=<MseLossBackward>)
loss tensor(0.2244, grad_fn=<MseLossBackward>)
loss tensor(0.0663, grad_fn=<MseLossBackward>)
loss tensor(0.5653, grad_fn=<MseLossBackward>)
loss tensor(0.4352, grad_fn=<MseLossBackward>)
l

len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
replay buffer
loss tensor(1.1392, grad_fn=<MseLossBackward>)
loss tensor(0.3559, grad_fn=<MseLossBackward>)
loss tensor(1.1687, grad_fn=<MseLossBackward>)
loss tensor(0.4190, grad_fn=<MseLossBackward>

loss tensor(0.2202, grad_fn=<MseLossBackward>)
loss tensor(1.7994, grad_fn=<MseLossBackward>)
loss tensor(0.0045, grad_fn=<MseLossBackward>)
loss tensor(1.8244, grad_fn=<MseLossBackward>)
loss tensor(0.6914, grad_fn=<MseLossBackward>)
loss tensor(0.5534, grad_fn=<MseLossBackward>)
loss tensor(1.4954, grad_fn=<MseLossBackward>)
loss tensor(0.6166, grad_fn=<MseLossBackward>)
loss tensor(0.4964, grad_fn=<MseLossBackward>)
loss tensor(1.0671, grad_fn=<MseLossBackward>)
loss tensor(0.9705, grad_fn=<MseLossBackward>)
loss tensor(0.5076, grad_fn=<MseLossBackward>)
loss tensor(1.6974, grad_fn=<MseLossBackward>)
loss tensor(0.6553, grad_fn=<MseLossBackward>)
loss tensor(1.5329, grad_fn=<MseLossBackward>)
loss tensor(0.8469, grad_fn=<MseLossBackward>)
loss tensor(0.8861, grad_fn=<MseLossBackward>)
loss tensor(0.6252, grad_fn=<MseLossBackward>)
loss tensor(0.9804, grad_fn=<MseLossBackward>)
loss tensor(0.4431, grad_fn=<MseLossBackward>)
loss tensor(0.8310, grad_fn=<MseLossBackward>)
step 10000
up

len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
replay buffer
loss tensor(1.1443, grad_fn=<MseLossBackward>)
loss tensor(1.1900, grad_fn=<MseLossBackward>)
loss tensor(0.9864, grad_fn=<MseLossBackward>)
loss tensor(1.0092, grad_fn=<MseLossBackward>)
loss tensor(0.8236, grad_fn=<MseLossBackward>)
loss tensor(1.1398, grad_fn=<MseLossBackward>)
loss tensor(0.8762, grad_fn=<MseLossBackward>)
loss tensor(1.1853, grad_fn=<MseLossBackward>)
loss tensor(0.6237, grad_fn=<MseLossBackward>)
loss tensor(0.7475, grad_fn=<MseLossBackward>)
loss tensor(1.4596, grad_fn=<MseLossBackward>)
loss tensor(0.9123, grad_fn=<MseLossBackward>)
loss tensor(0.8712, grad_fn=<MseLossBackward>)
loss tensor(1.5080, grad_fn=<MseLossBac

len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
replay buffer
loss tensor(0.7971, grad_fn=<MseLossBackward>)
loss tensor(0.5485, grad_fn=<MseLossBackward>)
loss tensor(1.3436, grad_fn=<MseLossBackward>)
loss tensor(0.5133, grad_fn=<MseLossBackward>)
loss tensor(0.5165, grad_fn=<MseLossBackward>)
loss tensor(0.3292, grad_fn=<MseLossBackward>)
loss tensor(0.0015, grad_fn=<MseLossBackward>)
l

len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
replay buffer
loss tensor(0.4513, grad_fn=<MseLossBackward>)
loss tensor(0.2668, grad_fn=<MseLossBackward>)
loss tensor(0.6992, grad_fn=<MseLossBackward>)
loss tensor(1.0738, grad_fn=<MseLossBackward>)
loss tensor(1.0772, grad_fn=<MseLossBackward>)
loss tensor(1.1604, grad_fn=<MseLossBackward>)


loss tensor(0.8714, grad_fn=<MseLossBackward>)
loss tensor(0.8381, grad_fn=<MseLossBackward>)
loss tensor(0.5410, grad_fn=<MseLossBackward>)
loss tensor(0.6764, grad_fn=<MseLossBackward>)
loss tensor(0.6033, grad_fn=<MseLossBackward>)
step 10750
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
le

len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
replay buffer
loss tensor(1.2559, grad_fn=<MseLossBackward>)
loss tensor(0.6865, grad_fn=<MseLossBackward>)
loss tensor(0.6216, grad_fn=<MseLossBackward>)
loss tensor(0.2528, grad_fn=<MseLossBackward>)
loss tensor(0.2184, grad_fn=<MseLossBackward>)
loss tensor(1.8201, grad_fn=<MseLossBackward>)


len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
replay buffer
loss tensor(0.2580, grad_fn=<MseLossBackward>)
loss tensor(0.2355, grad_fn=<MseLossBackward>)
loss tensor(2.1605, grad_fn=<MseLossBackward>)
loss tensor(0.1490, grad_fn=<MseLossBackward>)
loss tensor(0.0039, grad_fn=<MseLossBackward>)
loss tensor(2.8671, grad_fn=<MseLossBackward>)


len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
replay buffer
loss tensor(0.6907, grad_fn=<MseLossBackward>)
loss tensor(0.8474, grad_fn=<MseLossBackward>)
loss tensor(0.5106, grad_fn=<MseLossBackward>)
loss tensor(1.3284, grad_fn=<MseLossBackward>)
loss tensor(1.2447, grad_fn=<MseLossBackward>)
loss tensor(0.6526, grad_fn=<MseLossBackward>)
loss tensor(1.2626, grad_fn=<MseLossBackward>)
l

len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
replay buffer
loss tensor(0.6341, grad_fn=<MseLossBackward>)
loss tensor(0.4365, grad_fn=<MseLossBackward>)
loss tensor(0.9532, grad_fn=<MseLossBackward>)
loss tensor(0.5761, grad_fn=<MseLossBackward>)
loss tensor(0.3850, grad_fn=<MseLossBackward>)
loss tensor(0.4760, grad_fn=<MseLossBackward>)
loss tensor(2.17

len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
replay buffer
loss tensor(0.9183, grad_fn=<MseLossBackward>)
loss tensor(0.7080, grad_fn=<MseLossBackward>)
loss tensor(0.9327, grad_fn=<MseLossBackward>)
loss tensor(1.1665, grad_fn=<MseLossBackward>)
loss tensor(1.2457, grad_fn=<MseLossBackward>)
loss tensor(1.1257, grad_fn=<MseLossBackward>)
loss tensor(0.2625, grad_fn=<Mse

len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
replay buffer
loss tensor(0.7991, grad_fn=<MseLossBackward>)
loss tensor(1.0163, grad_fn=<MseLossBackward>)
loss tensor(0.2015, grad_fn=<MseLossBackward>)
loss tensor(0.6274, grad_fn=<MseLossBackward>)
loss tensor(0.4378, grad_fn=<MseLossBackward>)
loss tensor(0.4928, grad_fn=<Ms

loss tensor(2.6126, grad_fn=<MseLossBackward>)
loss tensor(2.8172, grad_fn=<MseLossBackward>)
loss tensor(0.1282, grad_fn=<MseLossBackward>)
loss tensor(0.0259, grad_fn=<MseLossBackward>)
loss tensor(0.1729, grad_fn=<MseLossBackward>)
loss tensor(0.0556, grad_fn=<MseLossBackward>)
step 12150
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len

loss tensor(0.7797, grad_fn=<MseLossBackward>)
loss tensor(0.5168, grad_fn=<MseLossBackward>)
loss tensor(0.9880, grad_fn=<MseLossBackward>)
step 12350
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
replay buffer
loss tensor(0.3291, grad_fn=<MseLo

len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
replay buffer
loss tensor(0.2363, grad_fn=<MseLossBackward>)
loss tensor(1.1259, grad_fn=<MseLossBackward>)
loss tensor(2.4854, grad_fn=<MseLossBackward>)
loss tensor(2.9158, grad_fn=<MseLossBackward>)
loss tensor(0.5277, grad_fn=<M

len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
replay buffer
loss tensor(1.6872, grad_fn=<MseLossBackward>)
loss tensor(0.4377, grad_fn=<MseLossBackward>)
loss tensor(0.5335, grad_fn=<MseLossBackward>)
loss tensor(0.7944, grad_fn=<MseLossBackward>)
loss tensor(0.6457, grad_fn=<MseLossBackward>)

len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
replay buffer
loss tensor(1.2340, grad_fn=<MseLossBackward>)
loss tensor(1.3243, grad_fn=<MseLossBackward>)
loss tensor(0.4448, grad_fn=<MseLossBackward>)
loss tensor(0.7546, grad_fn=<MseLossBackward>)
loss tensor(0.

loss tensor(0.1889, grad_fn=<MseLossBackward>)
loss tensor(2.8079, grad_fn=<MseLossBackward>)
loss tensor(3.3299, grad_fn=<MseLossBackward>)
loss tensor(0.1328, grad_fn=<MseLossBackward>)
loss tensor(2.5174, grad_fn=<MseLossBackward>)
loss tensor(0.1969, grad_fn=<MseLossBackward>)
loss tensor(2.9552, grad_fn=<MseLossBackward>)
loss tensor(1.7203, grad_fn=<MseLossBackward>)
loss tensor(2.2700, grad_fn=<MseLossBackward>)
loss tensor(0.1638, grad_fn=<MseLossBackward>)
loss tensor(0.3791, grad_fn=<MseLossBackward>)
loss tensor(1.6422, grad_fn=<MseLossBackward>)
loss tensor(0.3912, grad_fn=<MseLossBackward>)
loss tensor(0.4734, grad_fn=<MseLossBackward>)
loss tensor(0.6719, grad_fn=<MseLossBackward>)
loss tensor(1.3315, grad_fn=<MseLossBackward>)
step 13150
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 10

len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
replay buffer
loss tensor(0.9041, grad_fn=<MseLossBackward>)
loss tensor(0.9484, grad_fn=<MseLossBackward>)
loss tensor(1.2448, grad_fn=<MseLossBackward>)
loss tensor(0.7524, grad_fn=<MseLossBackward>)
loss tensor(1.0548, grad_fn=<MseLossBackward>)
loss tensor(1.2404, grad_fn=<MseLossBackward>)
loss tensor(0.8980, grad_fn=<MseLossBackward>)
loss tensor(1.1652, grad_fn=<MseLossBackward>)
loss tensor(0.9054, grad_fn=<MseLossBackward>)
loss tensor(0.9344, grad_fn=<MseLossBackward>)
loss tensor(1.2592, grad_fn=<MseLossBackward>)
loss tensor(0.8931, grad_fn=<MseLossBackward>)
loss tensor(0.7752, grad_fn=<MseLossBackward>)
loss tensor(0.2756, grad_fn=<MseLossBackward>)
loss tensor(0.4815, grad_fn=<MseLossBackward>)
loss tens

len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
replay buffer
loss tensor(1.0396, grad_fn=<MseLossBackward>)
loss tensor(1.1347, grad_fn=<MseLossBackward>)
loss tensor(1.6253, grad_fn=<MseLossBackward>)
loss tensor(1.2703, grad_fn=<MseLossBackward>)
loss tensor(0.9340, grad_fn=<MseLossBackward>)
loss tensor(0.9521, grad_fn=<MseLossBackward>)
loss tensor(1.10

len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
replay buffer
loss tensor(0.3145, grad_fn=<MseLossBackward>)
loss tensor(0.8546, grad_fn=<MseLossBackward>)
loss tensor(0.5815, grad_fn=<MseLossBackward>)
loss tensor(0.6605, grad_fn=<MseLossBackward>)
loss tensor(0.6276, grad_fn=<MseLossBackward>)
loss tensor(0.8419, grad_fn=<MseLossBackward>)
loss tensor(0.65

len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
len buffer 1000
replay buffer
loss tensor(0.7240, grad_fn=<MseLossBackward>)
loss tensor(0.1116, grad_fn=<MseLossBackward>)
loss tensor(1.8190, grad_fn=<MseLossBackward>)
loss tensor(2.6523, grad_fn=<MseLossBackward>)
loss tensor(0.4355, grad_fn=<MseLossBackward>)
loss tensor(0.8163, grad_fn=<Ms

In [None]:
check

In [None]:
torch.argmax(check)

In [None]:
np.random.randint(5, size=(2, 4))