In [15]:
import os
import sys
parent_dir = os.path.dirname(os.getcwd())
sys.path.append(parent_dir)
os.environ["CUDA_VISIBLE_DEVICES"] = "3,4,5"
from typing import Tuple, List, Optional
    
import numpy as np
import pandas as pd
from datetime import datetime
import matplotlib.pyplot as plt

import copy
import pickle
from tqdm.notebook import tqdm

import torch
from scipy.integrate import odeint
from scipy.special import erf

from llmICL.data.serialize import serialize_arr, deserialize_str, SerializerSettings
from llmICL.models.llama import get_model_and_tokenizer
from llmICL.models.ICL import MultiResolutionPDF, recursive_refiner, trim_kv_cache

import gymnasium as gym

# Helper functions (original)

In [16]:
def serialize_gaussian(prec, time_series, mean_series, sigma_series):
    """
    Serialize a time series with gaussian noise and continuous support.

    Parameters:
    prec (int): Precision of the serialization
    time_series (list): The time series data
    mean_series (list): The mean series data
    sigma_series (list): The sigma series data

    Returns:
    tuple: A tuple containing 
        serialized time series: str
        rescaled mean series: np array
        rescaled sigma series: np array
    """
    settings=SerializerSettings(base=10, prec=prec, signed=True, time_sep=',', bit_sep='', minus_sign='-', fixed_length=False, max_val = 10)
    time_series = np.array(time_series)
    ### Final range is from 0.15 to 0.85
    rescale_factor = 7.0
    up_shift = 1.5

    rescaled_array = (time_series-time_series.min())/(time_series.max()-time_series.min()) * rescale_factor + up_shift
    rescaled_true_mean_arr = (np.array(mean_series)-time_series.min())/(time_series.max()-time_series.min()) * rescale_factor + up_shift
    rescaled_true_sigma_arr = np.array(sigma_series)/(time_series.max()-time_series.min()) * rescale_factor 
    # rescaled_true_mean_arr *= 10
    # rescaled_true_sigma_arr *= 10
    full_series = serialize_arr(rescaled_array, settings)
    return (full_series, rescaled_true_mean_arr, rescaled_true_sigma_arr)

def calculate_multiPDF(full_series, prec, tokenizer, model, mode = 'neighbor', refine_depth = 1, llama_size = '13b'):
    '''
     This function calculates the multi-resolution probability density function (PDF) for a given series.

     Parameters:
     full_series (str): The series for which the PDF is to be calculated.
     prec (int): The precision of the PDF.
     mode (str, optional): The mode of calculation. Defaults to 'neighbor'.
     refine_depth (int, optional): The depth of refinement for the PDF. Defaults to 1.
     llama_size (str, optional): The size of the llama model. Defaults to '13b'.

     Returns:
     list: A list of PDFs for the series.
    '''
    good_tokens_str = list("0123456789")
    good_tokens = [tokenizer.convert_tokens_to_ids(token) for token in good_tokens_str]
    assert refine_depth < prec, "Refine depth must be less than precision"
    refine_depth = refine_depth - prec
    curr = -prec
    batch = tokenizer(
        [full_series], 
        return_tensors="pt",
        add_special_tokens=True        
    )
    torch.cuda.empty_cache()
    with torch.no_grad():
        out = model(batch['input_ids'].cuda(), use_cache=True)
    logit_mat = out['logits']
    kv_cache_main = out['past_key_values']
    print(f"{logit_mat.shape} {type(logit_mat)}")
    
    
    # ------- This is strangely slow ----------
    # logit_mat_good = logit_mat[:,:,good_tokens]
    # ---------------------------------------
    
    # --------- a faster alternative ---------
    logit_mat_good = []
    for gt in good_tokens:
        logit_mat_good.append(logit_mat[:,:,gt])
    logit_mat_good = torch.stack(logit_mat_good, axis=-1)
    # ----------------------------------------
    
    probs = torch.nn.functional.softmax(logit_mat_good[:,1:,:], dim=-1)
    
    PDF_list = []
    comma_locations = np.sort(np.where(np.array(list(full_series)) == ',')[0])

    # for i in range(len(comma_locations)-1, len(comma_locations)):
    i = len(comma_locations)-1
    PDF = MultiResolutionPDF()
    # slice out the number before ith comma
    if i == 0:
        start_idx = 0
    else:
        start_idx = comma_locations[i-1]+1
    end_idx = comma_locations[i]

    print(f"start_idx:end_idx -> {start_idx}:{end_idx}")
    
    num_slice = full_series[start_idx:end_idx]

    prob_slice = probs[0,start_idx:end_idx]
    prob_slice = prob_slice.detach().cpu().numpy()
    # prob_slice = prob_slice.cpu()
    # prob_slice = prob_slice.numpy()
   
    # np_prob_slice = np.ones(prob_slice.shape)
    # for i_prob in range(np_prob_slice.shape[0]):
    #     for j_prob in range(np_prob_slice.shape[1]):
    #         prob_slice_cp = prob_slice[i_prob, j_prob].item()  # .detach().cpu().numpy()
    #         np_prob_slice[i_prob, j_prob] = prob_slice_cp  #.item()
    # prob_slice = np_prob_slice
    
    print(f"prob_slice: {prob_slice.shape}, type: {type(prob_slice)}")
    print(f"num_slice: {num_slice}, type: {type(num_slice)}")
    
    ### Load hierarchical PDF 
    PDF.load_from_num_prob(num_slice, prob_slice)
    
    ### Refine hierarchical PDF
    seq = full_series[:end_idx]
    # cache and full_series are shifted from beginning, not end
    end_idx_neg = end_idx - len(full_series)
    ### kv cache contains seq[0:-1]
    kv_cache = trim_kv_cache(kv_cache_main, end_idx_neg-1)
    recursive_refiner(PDF, seq, curr = curr, main = True, refine_depth = refine_depth, mode = mode, 
                    kv_cache = kv_cache, model = model, tokenizer = tokenizer, good_tokens=good_tokens)

    PDF_list += [PDF]
    # end of for loop
        
    # release memory
    del logit_mat, kv_cache_main
    return PDF_list

def calculate_multiPDF_llama3(
    full_series, tokenizer, model, lookahead=1, temperature=1.0, number_of_tokens_original=None
):
    '''
     This function calculates the multi-resolution probability density function (PDF) for a given series.

     Parameters:
     full_series (str): The series for which the PDF is to be calculated.
     prec (int): The precision of the PDF.
     mode (str, optional): The mode of calculation. Defaults to 'neighbor'.
     refine_depth (int, optional): The depth of refinement for the PDF. Defaults to 1.
     llama_size (str, optional): The size of the llama model. Defaults to '13b'.
        
     Returns:
     list: A list of PDFs for the series.
    '''
    good_tokens_str = []
    for num in range(1000):
        good_tokens_str.append(str(num))
    good_tokens = [tokenizer.convert_tokens_to_ids(token) for token in good_tokens_str]
    batch = tokenizer(
        [full_series],
        return_tensors="pt",
        add_special_tokens=True        
    )
    
    torch.cuda.empty_cache()
    
    with torch.no_grad():
        out = model(batch['input_ids'].cuda(), use_cache=True)
    logit_mat = out['logits']
    kv_cache_main = out['past_key_values']
    
    logit_mat_good = logit_mat[:,:,good_tokens].clone()
    if number_of_tokens_original:
        probs = torch.nn.functional.softmax(logit_mat_good[:,-(number_of_tokens_original-1):,:] / temperature, dim=-1)
    else:
        probs = torch.nn.functional.softmax(logit_mat_good[:,1:,:] / temperature, dim=-1)
    
    PDF_list = []
    
    for i in range(1,int(probs.shape[1]),2):
        PDF = MultiResolutionPDF()
        PDF.bin_center_arr = np.arange(0,1000) / 100
        PDF.bin_width_arr = np.array(1000*[0.01])
        PDF.bin_height_arr = probs[0,i,:].cpu().numpy() * 100

        PDF_list.append(PDF)

    # sample next value: mode of the distribution
    next_value_idx = np.argmax(probs[0,-1,:].flatten())
    next_value = good_tokens_str[next_value_idx]

    mode_values = []
    mode_values.append(next_value)

    for _ in tqdm(range(lookahead-1)):
        batch = tokenizer(
            [f",{next_value}"],
            return_tensors="pt",
            add_special_tokens=True        
        )
        with torch.no_grad():
            out = model(batch['input_ids'].cuda(), use_cache=True, past_key_values = kv_cache_main)
        logit_mat = out['logits']
        kv_cache_main = out['past_key_values']

        logit_mat_good = logit_mat[:,:,good_tokens].clone()
        if number_of_tokens_original:
            probs = torch.nn.functional.softmax(logit_mat_good[:,-(number_of_tokens_original-1):,:] / temperature, dim=-1)
        else:
            probs = torch.nn.functional.softmax(logit_mat_good[:,1:,:] / temperature, dim=-1)
        
        # TODO: check if probs is of length the full sequence or just the last token
        
        PDF = MultiResolutionPDF()
        PDF.bin_center_arr = np.arange(0,1000) / 100
        PDF.bin_width_arr = np.array(1000*[0.01])
        PDF.bin_height_arr = probs[0,-1,:].cpu().numpy() * 100
        
        PDF_list.append(PDF)
        
        next_value_idx = np.argmax(probs[0,-1,:].flatten())
        next_value = good_tokens_str[next_value_idx]
        mode_values.append(next_value)

    # release memory
    del logit_mat, kv_cache_main
    return PDF_list, mode_values

def compute_statistics(full_series, PDF_list, rescaled_true_mean_arr, rescaled_true_sigma_arr):
    PDF_list = PDF_list[0]
    PDF_true_list = copy.deepcopy(PDF_list)
    discrete_BT_loss = []
    discrete_KL_loss = []
    for PDF, PDF_true, true_mean, true_sigma in zip(PDF_list, PDF_true_list, rescaled_true_mean_arr, rescaled_true_sigma_arr):
        def cdf(x):
            return 0.5 * (1 + erf((x - true_mean) / (true_sigma * np.sqrt(2))))
        
        PDF_true.discretize(cdf, mode = "cdf")
        PDF_true.compute_stats()
        discrete_BT_loss += [PDF_true.BT_dist(PDF)]    
        discrete_KL_loss += [PDF_true.KL_div(PDF)]
    
    discrete_BT_loss = np.array(discrete_BT_loss)
    discrete_KL_loss = np.array(discrete_KL_loss)
    
    ### Extract statistics from MultiResolutionPDF
    
    mean_arr = []
    mode_arr = []
    sigma_arr = []
    moment_3_arr = []
    moment_4_arr = []
    
    num_commas = full_series.count(',')
    # for comma_idx in range(num_commas):
    PDF_list[0].compute_stats()
    mean, mode, sigma = PDF_list[0].mean, PDF_list[0].mode, PDF_list[0].sigma 
    moment_3 = PDF_list[0].compute_moment(3)
    moment_4 = PDF_list[0].compute_moment(4)
    
    mean_arr.append(mean)
    mode_arr.append(mode)
    sigma_arr.append(sigma)
    moment_3_arr.append(moment_3)
    moment_4_arr.append(moment_4)
    # end of for loop
    
    # Convert lists to numpy arrays
    mean_arr = np.array(mean_arr)
    mode_arr = np.array(mode_arr)
    sigma_arr = np.array(sigma_arr)
    moment_3_arr = np.array(moment_3_arr)
    moment_4_arr = np.array(moment_4_arr)
    kurtosis_arr = moment_4_arr / sigma_arr**4
    kurtosis_error = (kurtosis_arr-3)**2
    
    error_mean = np.abs(rescaled_true_mean_arr - mean_arr)
    error_mode = np.abs(rescaled_true_mean_arr - mode_arr)
    return mean_arr, mode_arr, sigma_arr, moment_3_arr, moment_4_arr

# Generate RL trajectory

In [9]:
####################################
### Generate continuous time series
####################################

### Halfcheetah + random policy

N = 100 # number of steps

env = gym.make('HalfCheetah')

seed = 7
env.np_random.__setstate__(np.random.default_rng(seed).__getstate__())

obs, _ = env.reset()

observations = [obs]
actions = []
rewards = []
terminateds = []
truncateds = []
# Generate the episode
for t in range(1, N):
    action = env.action_space.sample()
    next_obs, reward, terminated, truncated, info = env.step(action)
    observations.append(next_obs)
    actions.append(action)
    rewards.append(reward)
    terminateds.append(terminated)
    truncateds.append(truncated)

  logger.warn(


# Main function: predict next state

In [10]:
OBSERVATION = np.array
ACTION = np.array
REWARD = np.array
TERMINATED = np.array
TRUNCATED = np.array

TRAJECTORY = Tuple[List[OBSERVATION], List[ACTION], List[REWARD], List[TERMINATED], List[TRUNCATED]]

def in_context_mbrl_sequential(trajectory: TRAJECTORY, model, tokenizer, prec: int = 3, verbose: int = 1) -> OBSERVATION:
    """Docstring.
    """

    observations, actions, rewards, terminateds, truncateds = trajectory
    observations = np.stack(observations, axis=0)
    actions = np.stack(actions, axis=0)
    rewards = np.stack(rewards, axis=0)
    
    n_obs = observations.shape[1]
    n_action = actions.shape[1]

    prediction_dims = []

    if verbose:
        pbar = tqdm(total=n_obs)
    for obs_dim in range(n_obs):
        time_series = observations[:, obs_dim]
        mean_series = copy.copy(time_series)
        std_series = 0.1*np.ones_like(mean_series)
        
        # -------------------- Rescaling + n-digits encoding --------------------
        full_series, rescaled_true_mean_arr, rescaled_true_sigma_arr = serialize_gaussian(prec, time_series, mean_series, std_series)

        # -------------------- Compute predictions --------------------
        PDF_list = calculate_multiPDF_llama3(full_series, tokenizer=tokenizer, model=model)

        # -------------------- Compute statistics of next state --------------------
        mean_arr, mode_arr, sigma_arr, moment_3_arr, moment_4_arr = compute_statistics(
            full_series=full_series, PDF_list=PDF_list, rescaled_true_mean_arr=rescaled_true_mean_arr, rescaled_true_sigma_arr=rescaled_true_sigma_arr
        )

        prediction_dims.append(mode_arr[-1])

        if verbose:
            pbar.update(1)
    if verbose:
        pbar.close()
    
    prediction = np.array(prediction_dims)
    return prediction 

def in_context_mbrl_autoregressive(trajectory: TRAJECTORY, model, tokenizer, prec: int = 3, verbose: int = 1) -> OBSERVATION:
    """Docstring.
    """

    observations, actions, rewards, terminateds, truncateds = trajectory
    observations = np.stack(observations, axis=0)
    actions = np.stack(actions, axis=0)
    rewards = np.stack(rewards, axis=0)
    
    n_obs = observations.shape[1]
    n_action = actions.shape[1]

    prediction_dims = []

    time_series = np.concatenate([observations, actions, rewards], axis=1).reshape(-1)
    mean_series = copy.copy(time_series)
    std_series = 0.1*np.ones_like(mean_series)
    
    # -------------------- Rescaling + n-digits encoding --------------------
    full_series, rescaled_true_mean_arr, rescaled_true_sigma_arr = serialize_gaussian(prec, time_series, mean_series, std_series)

    # -------------------- Compute predictions --------------------
    PDF_list, mode_values = calculate_multiPDF_llama3(full_series, tokenizer=tokenizer, model=model, lookahead=17)

    # -------------------- Compute statistics of next state --------------------
    # mean_arr, mode_arr, sigma_arr, moment_3_arr, moment_4_arr = compute_statistics(
    #     full_series=full_series, PDF_list=PDF_list, rescaled_true_mean_arr=rescaled_true_mean_arr, rescaled_true_sigma_arr=rescaled_true_sigma_arr
    # )

    # prediction_dims.append(mode_arr[-1])
    
    # prediction = np.array(prediction_dims)
    return np.array(mode_values)

In [11]:
np.array([[1,2,3],[4,5,6],[7,8,9]]).reshape(-1)

array([1, 2, 3, 4, 5, 6, 7, 8, 9])

In [31]:
%load_ext line_profiler

The line_profiler extension is already loaded. To reload it, use:
  %reload_ext line_profiler


In [34]:
prec=3
time_series = np.stack(observations, axis=0)[:, 0]
mean_series = copy.copy(time_series)
std_series = 0.1*np.ones_like(mean_series)

# pre_prompt = "predict the next element of this series of observations:"
pre_prompt = ""

# -------------------- Rescaling + n-digits encoding --------------------
full_series, rescaled_true_mean_arr, rescaled_true_sigma_arr = serialize_gaussian(prec, time_series, mean_series, std_series)

# model, tokenizer = get_model_and_tokenizer('7b')

# %lprun -f in_context_mbrl in_context_mbrl(trajectory=(observations, actions, rewards, terminateds, truncateds), model=model, tokenizer=tokenizer)
%lprun -f calculate_multiPDF calculate_multiPDF(pre_prompt+full_series, tokenizer=tokenizer, model=model, prec=prec, mode='neighbor', refine_depth=1, llama_size='7b')

torch.Size([1, 502, 32000]) <class 'torch.Tensor'>
start_idx:end_idx -> 495:499
prob_slice: (4, 10), type: <class 'numpy.ndarray'>
num_slice: 8214, type: <class 'str'>


Timer unit: 1e-09 s

Total time: 8.26761 s
File: /tmp/ipykernel_1307365/1050521022.py
Function: calculate_multiPDF at line 31

Line #      Hits         Time  Per Hit   % Time  Line Contents
    31                                           def calculate_multiPDF(full_series, prec, tokenizer, model, mode = 'neighbor', refine_depth = 1, llama_size = '13b'):
    32                                               '''
    33                                                This function calculates the multi-resolution probability density function (PDF) for a given series.
    34                                           
    35                                                Parameters:
    36                                                full_series (str): The series for which the PDF is to be calculated.
    37                                                prec (int): The precision of the PDF.
    38                                                mode (str, optional): The mode of calculation.

# Gym-like Environment

In [12]:
import numpy as np
import random

from gymnasium.envs.mujoco.half_cheetah_v4 import HalfCheetahEnv
import gymnasium as gym
from stable_baselines3 import SAC
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import ConvertCallback

In [13]:
class ReplayBuffer:
    def __init__(self, max_size=1e6):
        self.buffer = []
        self.max_size = int(max_size)
        self.current_size = 0

    def add_episode(self, episode: TRAJECTORY):
        # Each episode is a tuple of (observations, actions, rewards, dones)
        if self.current_size < self.max_size:
            self.buffer.append(episode)
            self.current_size += 1
        else:
            # Replace an old episode randomly
            self.buffer.append(episode)
            self.buffer.pop(0)

    def sample(self):
        # Randomly sample an episode
        episode = random.choice(self.buffer)
        observations, actions, rewards, terminateds, truncateds = episode
        
        # Randomly choose a start index
        max_start_index = len(observations) - 100
        if max_start_index <= 0:
            return episode  # Return the entire episode if less than 100 steps
        
        start_index = random.randint(0, max_start_index)
        end_index = start_index + 100
        
        return (observations[start_index:end_index],
                actions[start_index:end_index],
                rewards[start_index:end_index],
                terminateds[start_index:end_index],
                truncateds[start_index:end_index]
               )

class LLMReplayWrapper(HalfCheetahEnv):
    def __init__(self, max_episode_length=10):
        super(LLMReplayWrapper).__init__()
        self.replay_buffer = ReplayBuffer()
        self.current_trajectory = None
        self.max_episode_length = max_episode_length

        self.observation_space = HalfCheetahEnv().observation_space
        self.action_space = HalfCheetahEnv().action_space

        self.verbose = 1

        # -------------------- Load LLM --------------------
        self.model, self.tokenizer = get_model_and_tokenizer('7b')

    def add_trajectory(self, trajectory: TRAJECTORY):
        self.replay_buffer.add_episode(trajectory)
    
    def reset(self, seed=0):
        # Sample a trajectory of size 100
        self.current_trajectory = self.replay_buffer.sample()
        self.current_index = 0

        # Return the first observation of the sampled trajectory
        obs, _, _, _, _ = self.current_trajectory
        return obs[-1], {} 

    def step(self, action):
        if self.current_trajectory is None:
            raise ValueError("Environment is not reset. Call reset() before step().")
        
        observations, actions, rewards, terminateds, truncateds = self.current_trajectory
        actions.append(action)
        
        # Append the new action to the current trajectory
        self.current_index += 1

        # Use the in_context_mbrl function to predict the next observation
        input_trajectory = observations, actions, rewards, terminateds, truncateds
        next_obs = in_context_mbrl_sequential(
            trajectory=input_trajectory, 
            model=self.model, 
            tokenizer=self.tokenizer, 
            prec=3,
            verbose=self.verbose,
        )  # Assuming precision 'prec' is set here
        reward = self._compute_reward(next_obs)
        
        terminated = False
        truncated = False
        if self.current_index == self.max_episode_length:
            truncated = True
        
        # Append the predicted observation to the trajectory
        observations.append(next_obs)
        rewards.append(reward)  # Reward is 0 for now
        terminateds.append(terminated)  # Done is False for now
        truncateds.append(truncated)
        self.current_trajectory = observations, actions, rewards, terminateds, truncateds
        
        # Get the statistics for the info dict from your environment's prediction mechanism
        info = {
            'mean': np.mean(next_obs),  # Example statistic
            'mode': np.median(next_obs),  # Assume mode is calculated or approximated
            'std': np.std(next_obs)
        }
        
        return next_obs, reward, terminated, truncated, info

    def _compute_reward(self, observation):
        return 0.0

# MBRL-like loop

In [14]:
from stable_baselines3.common.type_aliases import GymEnv, MaybeCallback
from stable_baselines3.common.type_aliases import TrainFreq

class ModifiedSAC(SAC):
    def learn(
        self,
        total_timesteps: int,
        callback: MaybeCallback = None,
        log_interval: int = 4,
        tb_log_name: str = "run",
        reset_num_timesteps: bool = True,
        progress_bar: bool = False,
    ) -> "ModifiedSAC":
        total_timesteps, callback = self._setup_learn(
            total_timesteps,
            callback,
            reset_num_timesteps,
            tb_log_name,
            progress_bar,
        )

        callback.on_training_start(locals(), globals())

        assert self.env is not None, "You must set the environment before calling learn()"
        assert isinstance(self.train_freq, TrainFreq)  # check done in _setup_learn()

        pbar = tqdm(total=total_timesteps)
        while self.num_timesteps < total_timesteps:
            old_num_timesteps = self.num_timesteps
            rollout = self.collect_rollouts(
                self.env,
                train_freq=self.train_freq,
                action_noise=self.action_noise,
                callback=callback,
                learning_starts=self.learning_starts,
                replay_buffer=self.replay_buffer,
                log_interval=log_interval,
            )

            if not rollout.continue_training:
                break
            
            if self.num_timesteps > 0 and self.num_timesteps > self.learning_starts:
                # If no `gradient_steps` is specified,
                # do as many gradients steps as steps performed during the rollout
                gradient_steps = self.gradient_steps if self.gradient_steps >= 0 else rollout.episode_timesteps
                # Special case when the user passes `gradient_steps=0`
                if gradient_steps > 0:
                    self.train(batch_size=self.batch_size, gradient_steps=gradient_steps)
            
            pbar.update(self.num_timesteps-old_num_timesteps)
        pbar.close()
        
        callback.on_training_end()
        
        return self

In [8]:
def main_loop():
    # Training parameters
    n_episodes_real = 1
    
    # Initialize the real Gym environment and model-based environment wrapper
    env = make_vec_env('HalfCheetah')
    llm_env = make_vec_env(LLMReplayWrapper)
    
    # Initialize the SAC algorithm
    sac = ModifiedSAC(
        "MlpPolicy", 
        env,
        buffer_size=int(1e5),
        learning_starts=int(1e2),
        batch_size=256,
        train_freq=1,
        gradient_steps=1,
        verbose=2,
    )
    
    # List to store returns from real environment interactions
    real_env_returns = []
    
    # Perform learning loop
    for iteration in range(n_episodes_real):
        # Interact with the real environment
        if iteration > 0:
            sac.learning_starts = 1
        else:
            sac.learn(
                total_timesteps=int(1e3),
                log_interval=1,
            )
        # sac.collect_rollouts(
        #     env=env,
        #     callback=ConvertCallback(None),
        #     train_freq=sac.train_freq,
        #     replay_buffer=sac.replay_buffer,
        #     action_noise=sac.action_noise,
        #     learning_starts=sac.learning_starts,
        # )
        
        # extract newly added episode from sac replay buffer
        print(f"pos in replay buffer: {sac.replay_buffer.pos}")
        to_add_obs = [sac.replay_buffer.observations[i].flatten() for i in range(sac.replay_buffer.pos-1000, sac.replay_buffer.pos)]
        to_add_act = [sac.replay_buffer.actions[i].flatten() for i in range(sac.replay_buffer.pos-1000, sac.replay_buffer.pos)]
        to_add_rew = [float(sac.replay_buffer.rewards[i].flatten()) for i in range(sac.replay_buffer.pos-1000, sac.replay_buffer.pos)]
        to_add_don = [int(sac.replay_buffer.dones[i].flatten()) for i in range(sac.replay_buffer.pos-1000, sac.replay_buffer.pos)]
        
        # episode_rewards = sum(env.get_attr('episode_rewards'))  # Retrieve episode rewards after rollouts
        # real_env_returns.append(episode_rewards)
        
        # add real env trajectory to LLM replay buffer
        llm_env.envs[0].add_trajectory(trajectory=(to_add_obs, to_add_act, to_add_rew, to_add_don, to_add_don))  # dummy for now
        llm_env.envs[0].verbose=1
    
        # Now interact with the LLM-based environment
    
        # change environment
        sac.env = llm_env
        sac.learning_starts = 1
        sac.learn(
            total_timesteps=int(1e3),
            log_interval=1,
        )
        
        # episode_rewards = sum(env.get_attr('episode_rewards'))  # Retrieve episode rewards after rollouts
        # real_env_returns.append(episode_rewards)
    
    print("Recorded Returns from Real Environment:", real_env_returns)

In [None]:
main_loop()

  logger.warn(
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Using cuda device


  0%|          | 0/1000 [00:00<?, ?it/s]

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1e+03    |
|    ep_rew_mean     | -318     |
| time/              |          |
|    episodes        | 1        |
|    fps             | 73       |
|    time_elapsed    | 13       |
|    total_timesteps | 1000     |
| train/             |          |
|    actor_loss      | -16.4    |
|    critic_loss     | 0.498    |
|    ent_coef        | 0.764    |
|    ent_coef_loss   | -2.69    |
|    learning_rate   | 0.0003   |
|    n_updates       | 899      |
---------------------------------
pos in replay buffer: 1000


  to_add_rew = [float(sac.replay_buffer.rewards[i].flatten()) for i in range(sac.replay_buffer.pos-1000, sac.replay_buffer.pos)]
  to_add_don = [int(sac.replay_buffer.dones[i].flatten()) for i in range(sac.replay_buffer.pos-1000, sac.replay_buffer.pos)]
  logger.warn(


  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/17 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  log_ratio = np.log(self.bin_height_arr) - np.log(Multi_PDF.bin_height_arr)
  weighted_log_ratio = log_ratio * self.bin_height_arr * self.bin_width_arr


  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/17 [00:00<?, ?it/s]

  0%|          | 0/151 [00:00<?, ?it/s]

  0%|          | 0/151 [00:00<?, ?it/s]

  0%|          | 0/151 [00:00<?, ?it/s]

  0%|          | 0/151 [00:00<?, ?it/s]

  0%|          | 0/151 [00:00<?, ?it/s]

  0%|          | 0/151 [00:00<?, ?it/s]

  0%|          | 0/151 [00:00<?, ?it/s]

  0%|          | 0/151 [00:00<?, ?it/s]

  0%|          | 0/151 [00:00<?, ?it/s]

  0%|          | 0/151 [00:00<?, ?it/s]

  0%|          | 0/151 [00:00<?, ?it/s]

  0%|          | 0/151 [00:00<?, ?it/s]

  0%|          | 0/151 [00:00<?, ?it/s]

  0%|          | 0/151 [00:00<?, ?it/s]

  0%|          | 0/151 [00:00<?, ?it/s]

  0%|          | 0/151 [00:00<?, ?it/s]

  0%|          | 0/151 [00:00<?, ?it/s]

  0%|          | 0/17 [00:00<?, ?it/s]

  0%|          | 0/153 [00:00<?, ?it/s]

  0%|          | 0/153 [00:00<?, ?it/s]

  0%|          | 0/153 [00:00<?, ?it/s]

  0%|          | 0/153 [00:00<?, ?it/s]

  0%|          | 0/153 [00:00<?, ?it/s]

  0%|          | 0/153 [00:00<?, ?it/s]

  0%|          | 0/153 [00:00<?, ?it/s]

  0%|          | 0/153 [00:00<?, ?it/s]

  0%|          | 0/153 [00:00<?, ?it/s]

  0%|          | 0/153 [00:00<?, ?it/s]

  0%|          | 0/153 [00:00<?, ?it/s]

  0%|          | 0/153 [00:00<?, ?it/s]

  0%|          | 0/153 [00:00<?, ?it/s]

  0%|          | 0/153 [00:00<?, ?it/s]

  0%|          | 0/153 [00:00<?, ?it/s]

  0%|          | 0/153 [00:00<?, ?it/s]

  0%|          | 0/153 [00:00<?, ?it/s]

  0%|          | 0/17 [00:00<?, ?it/s]

  0%|          | 0/154 [00:00<?, ?it/s]

  0%|          | 0/154 [00:00<?, ?it/s]

  0%|          | 0/154 [00:00<?, ?it/s]

  0%|          | 0/154 [00:00<?, ?it/s]

  0%|          | 0/154 [00:00<?, ?it/s]

  0%|          | 0/154 [00:00<?, ?it/s]

  0%|          | 0/154 [00:00<?, ?it/s]

  0%|          | 0/154 [00:00<?, ?it/s]

  0%|          | 0/154 [00:00<?, ?it/s]

  0%|          | 0/154 [00:00<?, ?it/s]

  0%|          | 0/154 [00:00<?, ?it/s]

  0%|          | 0/154 [00:00<?, ?it/s]

  0%|          | 0/154 [00:00<?, ?it/s]

  0%|          | 0/154 [00:00<?, ?it/s]

  0%|          | 0/154 [00:00<?, ?it/s]

  0%|          | 0/154 [00:00<?, ?it/s]

  0%|          | 0/154 [00:00<?, ?it/s]

  0%|          | 0/17 [00:00<?, ?it/s]

  0%|          | 0/156 [00:00<?, ?it/s]

  0%|          | 0/156 [00:00<?, ?it/s]

  0%|          | 0/156 [00:00<?, ?it/s]

  0%|          | 0/156 [00:00<?, ?it/s]

  0%|          | 0/156 [00:00<?, ?it/s]

  0%|          | 0/156 [00:00<?, ?it/s]

  0%|          | 0/156 [00:00<?, ?it/s]

  0%|          | 0/156 [00:00<?, ?it/s]

  0%|          | 0/156 [00:00<?, ?it/s]

  0%|          | 0/156 [00:00<?, ?it/s]

  0%|          | 0/156 [00:00<?, ?it/s]

  0%|          | 0/156 [00:00<?, ?it/s]

  0%|          | 0/156 [00:00<?, ?it/s]

  0%|          | 0/156 [00:00<?, ?it/s]

  0%|          | 0/156 [00:00<?, ?it/s]

  0%|          | 0/156 [00:00<?, ?it/s]

  0%|          | 0/156 [00:00<?, ?it/s]

  0%|          | 0/17 [00:00<?, ?it/s]

  0%|          | 0/157 [00:00<?, ?it/s]

  0%|          | 0/157 [00:00<?, ?it/s]

  0%|          | 0/157 [00:00<?, ?it/s]

  0%|          | 0/157 [00:00<?, ?it/s]

  0%|          | 0/157 [00:00<?, ?it/s]

  0%|          | 0/157 [00:00<?, ?it/s]

  0%|          | 0/157 [00:00<?, ?it/s]

  0%|          | 0/157 [00:00<?, ?it/s]

  0%|          | 0/157 [00:00<?, ?it/s]

  0%|          | 0/157 [00:00<?, ?it/s]

  0%|          | 0/157 [00:00<?, ?it/s]

  0%|          | 0/157 [00:00<?, ?it/s]

  0%|          | 0/157 [00:00<?, ?it/s]

  0%|          | 0/157 [00:00<?, ?it/s]

  0%|          | 0/157 [00:00<?, ?it/s]

  0%|          | 0/157 [00:00<?, ?it/s]

  0%|          | 0/157 [00:00<?, ?it/s]

  0%|          | 0/17 [00:00<?, ?it/s]

  0%|          | 0/159 [00:00<?, ?it/s]

  0%|          | 0/159 [00:00<?, ?it/s]

  0%|          | 0/159 [00:00<?, ?it/s]

  0%|          | 0/159 [00:00<?, ?it/s]

  0%|          | 0/159 [00:00<?, ?it/s]

  0%|          | 0/159 [00:00<?, ?it/s]

  0%|          | 0/159 [00:00<?, ?it/s]

  0%|          | 0/159 [00:00<?, ?it/s]

  0%|          | 0/159 [00:00<?, ?it/s]

  0%|          | 0/159 [00:00<?, ?it/s]

  0%|          | 0/159 [00:00<?, ?it/s]

  0%|          | 0/159 [00:00<?, ?it/s]

  0%|          | 0/159 [00:00<?, ?it/s]

  0%|          | 0/159 [00:00<?, ?it/s]

  0%|          | 0/159 [00:00<?, ?it/s]

  0%|          | 0/159 [00:00<?, ?it/s]

  0%|          | 0/159 [00:00<?, ?it/s]

  0%|          | 0/17 [00:00<?, ?it/s]

  0%|          | 0/160 [00:00<?, ?it/s]

  0%|          | 0/160 [00:00<?, ?it/s]

  0%|          | 0/160 [00:00<?, ?it/s]

  0%|          | 0/160 [00:00<?, ?it/s]

  0%|          | 0/160 [00:00<?, ?it/s]

  0%|          | 0/160 [00:00<?, ?it/s]

  0%|          | 0/160 [00:00<?, ?it/s]

  0%|          | 0/160 [00:00<?, ?it/s]

  0%|          | 0/160 [00:00<?, ?it/s]

  0%|          | 0/160 [00:00<?, ?it/s]

  0%|          | 0/160 [00:00<?, ?it/s]

  0%|          | 0/160 [00:00<?, ?it/s]

  0%|          | 0/160 [00:00<?, ?it/s]

  0%|          | 0/160 [00:00<?, ?it/s]

  0%|          | 0/160 [00:00<?, ?it/s]

  0%|          | 0/160 [00:00<?, ?it/s]

  0%|          | 0/160 [00:00<?, ?it/s]

  0%|          | 0/17 [00:00<?, ?it/s]

  0%|          | 0/162 [00:00<?, ?it/s]

  0%|          | 0/162 [00:00<?, ?it/s]

  0%|          | 0/162 [00:00<?, ?it/s]

  0%|          | 0/162 [00:00<?, ?it/s]

  0%|          | 0/162 [00:00<?, ?it/s]

  0%|          | 0/162 [00:00<?, ?it/s]

  0%|          | 0/162 [00:00<?, ?it/s]

  0%|          | 0/162 [00:00<?, ?it/s]

  0%|          | 0/162 [00:00<?, ?it/s]

  0%|          | 0/162 [00:00<?, ?it/s]

  0%|          | 0/162 [00:00<?, ?it/s]

  0%|          | 0/162 [00:00<?, ?it/s]

  0%|          | 0/162 [00:00<?, ?it/s]

  0%|          | 0/162 [00:00<?, ?it/s]

  0%|          | 0/162 [00:00<?, ?it/s]

  0%|          | 0/162 [00:00<?, ?it/s]

  0%|          | 0/162 [00:00<?, ?it/s]

  0%|          | 0/17 [00:00<?, ?it/s]

  0%|          | 0/163 [00:00<?, ?it/s]

  0%|          | 0/163 [00:00<?, ?it/s]

  0%|          | 0/163 [00:00<?, ?it/s]

  0%|          | 0/163 [00:00<?, ?it/s]

  0%|          | 0/163 [00:00<?, ?it/s]

  0%|          | 0/163 [00:00<?, ?it/s]

  0%|          | 0/163 [00:00<?, ?it/s]

  0%|          | 0/163 [00:00<?, ?it/s]

  0%|          | 0/163 [00:00<?, ?it/s]

  0%|          | 0/163 [00:00<?, ?it/s]

  0%|          | 0/163 [00:00<?, ?it/s]

  0%|          | 0/163 [00:00<?, ?it/s]

  0%|          | 0/163 [00:00<?, ?it/s]

  0%|          | 0/163 [00:00<?, ?it/s]

  0%|          | 0/163 [00:00<?, ?it/s]

  0%|          | 0/163 [00:00<?, ?it/s]

  0%|          | 0/163 [00:00<?, ?it/s]

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 10       |
|    ep_rew_mean     | 0        |
| time/              |          |
|    episodes        | 1        |
|    fps             | 0        |
|    time_elapsed    | 154      |
|    total_timesteps | 10       |
| train/             |          |
|    actor_loss      | -16.5    |
|    critic_loss     | 0.853    |
|    ent_coef        | 0.762    |
|    ent_coef_loss   | -2.72    |
|    learning_rate   | 0.0003   |
|    n_updates       | 908      |
---------------------------------


  0%|          | 0/17 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/17 [00:00<?, ?it/s]

  0%|          | 0/151 [00:00<?, ?it/s]

  0%|          | 0/151 [00:00<?, ?it/s]

  0%|          | 0/151 [00:00<?, ?it/s]

  0%|          | 0/151 [00:00<?, ?it/s]

  0%|          | 0/151 [00:00<?, ?it/s]

  0%|          | 0/151 [00:00<?, ?it/s]

  0%|          | 0/151 [00:00<?, ?it/s]

  0%|          | 0/151 [00:00<?, ?it/s]

  0%|          | 0/151 [00:00<?, ?it/s]

  0%|          | 0/151 [00:00<?, ?it/s]

  0%|          | 0/151 [00:00<?, ?it/s]

  0%|          | 0/151 [00:00<?, ?it/s]

  0%|          | 0/151 [00:00<?, ?it/s]

  0%|          | 0/151 [00:00<?, ?it/s]

  0%|          | 0/151 [00:00<?, ?it/s]

  0%|          | 0/151 [00:00<?, ?it/s]

  0%|          | 0/151 [00:00<?, ?it/s]

  0%|          | 0/17 [00:00<?, ?it/s]

  0%|          | 0/153 [00:00<?, ?it/s]

  0%|          | 0/153 [00:00<?, ?it/s]

  0%|          | 0/153 [00:00<?, ?it/s]

  0%|          | 0/153 [00:00<?, ?it/s]

  0%|          | 0/153 [00:00<?, ?it/s]

  0%|          | 0/153 [00:00<?, ?it/s]

  0%|          | 0/153 [00:00<?, ?it/s]

  0%|          | 0/153 [00:00<?, ?it/s]

  0%|          | 0/153 [00:00<?, ?it/s]

  0%|          | 0/153 [00:00<?, ?it/s]

  0%|          | 0/153 [00:00<?, ?it/s]

  0%|          | 0/153 [00:00<?, ?it/s]

  0%|          | 0/153 [00:00<?, ?it/s]

  0%|          | 0/153 [00:00<?, ?it/s]

  0%|          | 0/153 [00:00<?, ?it/s]

  0%|          | 0/153 [00:00<?, ?it/s]

  0%|          | 0/153 [00:00<?, ?it/s]

  0%|          | 0/17 [00:00<?, ?it/s]

  0%|          | 0/154 [00:00<?, ?it/s]

  0%|          | 0/154 [00:00<?, ?it/s]

  0%|          | 0/154 [00:00<?, ?it/s]

  0%|          | 0/154 [00:00<?, ?it/s]

  0%|          | 0/154 [00:00<?, ?it/s]

  0%|          | 0/154 [00:00<?, ?it/s]

  0%|          | 0/154 [00:00<?, ?it/s]

  0%|          | 0/154 [00:00<?, ?it/s]

  0%|          | 0/154 [00:00<?, ?it/s]

  0%|          | 0/154 [00:00<?, ?it/s]

  0%|          | 0/154 [00:00<?, ?it/s]

  0%|          | 0/154 [00:00<?, ?it/s]

  0%|          | 0/154 [00:00<?, ?it/s]

  0%|          | 0/154 [00:00<?, ?it/s]

  0%|          | 0/154 [00:00<?, ?it/s]

  0%|          | 0/154 [00:00<?, ?it/s]

  0%|          | 0/154 [00:00<?, ?it/s]

  0%|          | 0/17 [00:00<?, ?it/s]

  0%|          | 0/156 [00:00<?, ?it/s]

  0%|          | 0/156 [00:00<?, ?it/s]

  0%|          | 0/156 [00:00<?, ?it/s]

  0%|          | 0/156 [00:00<?, ?it/s]

  0%|          | 0/156 [00:00<?, ?it/s]

  0%|          | 0/156 [00:00<?, ?it/s]

  0%|          | 0/156 [00:00<?, ?it/s]

  0%|          | 0/156 [00:00<?, ?it/s]

  0%|          | 0/156 [00:00<?, ?it/s]

  0%|          | 0/156 [00:00<?, ?it/s]

  0%|          | 0/156 [00:00<?, ?it/s]

  0%|          | 0/156 [00:00<?, ?it/s]

  0%|          | 0/156 [00:00<?, ?it/s]

  0%|          | 0/156 [00:00<?, ?it/s]

  0%|          | 0/156 [00:00<?, ?it/s]

  0%|          | 0/156 [00:00<?, ?it/s]

  0%|          | 0/156 [00:00<?, ?it/s]

  0%|          | 0/17 [00:00<?, ?it/s]

  0%|          | 0/157 [00:00<?, ?it/s]

  0%|          | 0/157 [00:00<?, ?it/s]

  0%|          | 0/157 [00:00<?, ?it/s]

  0%|          | 0/157 [00:00<?, ?it/s]

  0%|          | 0/157 [00:00<?, ?it/s]

  0%|          | 0/157 [00:00<?, ?it/s]

  0%|          | 0/157 [00:00<?, ?it/s]

  0%|          | 0/157 [00:00<?, ?it/s]

  0%|          | 0/157 [00:00<?, ?it/s]

  0%|          | 0/157 [00:00<?, ?it/s]

  0%|          | 0/157 [00:00<?, ?it/s]

  0%|          | 0/157 [00:00<?, ?it/s]

  0%|          | 0/157 [00:00<?, ?it/s]

  0%|          | 0/157 [00:00<?, ?it/s]

  0%|          | 0/157 [00:00<?, ?it/s]

  0%|          | 0/157 [00:00<?, ?it/s]

  0%|          | 0/157 [00:00<?, ?it/s]

  0%|          | 0/17 [00:00<?, ?it/s]

  0%|          | 0/159 [00:00<?, ?it/s]

  0%|          | 0/159 [00:00<?, ?it/s]

  0%|          | 0/159 [00:00<?, ?it/s]

  0%|          | 0/159 [00:00<?, ?it/s]

  0%|          | 0/159 [00:00<?, ?it/s]

  0%|          | 0/159 [00:00<?, ?it/s]

  0%|          | 0/159 [00:00<?, ?it/s]

  0%|          | 0/159 [00:00<?, ?it/s]

  0%|          | 0/159 [00:00<?, ?it/s]

  0%|          | 0/159 [00:00<?, ?it/s]

  0%|          | 0/159 [00:00<?, ?it/s]

  0%|          | 0/159 [00:00<?, ?it/s]

  0%|          | 0/159 [00:00<?, ?it/s]

  0%|          | 0/159 [00:00<?, ?it/s]

  0%|          | 0/159 [00:00<?, ?it/s]

  0%|          | 0/159 [00:00<?, ?it/s]

  0%|          | 0/159 [00:00<?, ?it/s]

  0%|          | 0/17 [00:00<?, ?it/s]

  0%|          | 0/160 [00:00<?, ?it/s]

  0%|          | 0/160 [00:00<?, ?it/s]

  0%|          | 0/160 [00:00<?, ?it/s]

  0%|          | 0/160 [00:00<?, ?it/s]

  0%|          | 0/160 [00:00<?, ?it/s]

  0%|          | 0/160 [00:00<?, ?it/s]

  0%|          | 0/160 [00:00<?, ?it/s]

  0%|          | 0/160 [00:00<?, ?it/s]

  0%|          | 0/160 [00:00<?, ?it/s]

  0%|          | 0/160 [00:00<?, ?it/s]

  0%|          | 0/160 [00:00<?, ?it/s]

  0%|          | 0/160 [00:00<?, ?it/s]

  0%|          | 0/160 [00:00<?, ?it/s]

  0%|          | 0/160 [00:00<?, ?it/s]

  0%|          | 0/160 [00:00<?, ?it/s]

  0%|          | 0/160 [00:00<?, ?it/s]

  0%|          | 0/160 [00:00<?, ?it/s]

  0%|          | 0/17 [00:00<?, ?it/s]

  0%|          | 0/162 [00:00<?, ?it/s]

  0%|          | 0/162 [00:00<?, ?it/s]

  0%|          | 0/162 [00:00<?, ?it/s]

  0%|          | 0/162 [00:00<?, ?it/s]

  0%|          | 0/162 [00:00<?, ?it/s]

  0%|          | 0/162 [00:00<?, ?it/s]

  0%|          | 0/162 [00:00<?, ?it/s]

  0%|          | 0/162 [00:00<?, ?it/s]

  0%|          | 0/162 [00:00<?, ?it/s]

  0%|          | 0/162 [00:00<?, ?it/s]

  0%|          | 0/162 [00:00<?, ?it/s]

  0%|          | 0/162 [00:00<?, ?it/s]

  0%|          | 0/162 [00:00<?, ?it/s]

  0%|          | 0/162 [00:00<?, ?it/s]

  0%|          | 0/162 [00:00<?, ?it/s]

  0%|          | 0/162 [00:00<?, ?it/s]

  0%|          | 0/162 [00:00<?, ?it/s]

  0%|          | 0/17 [00:00<?, ?it/s]

  0%|          | 0/163 [00:00<?, ?it/s]

  0%|          | 0/163 [00:00<?, ?it/s]

  0%|          | 0/163 [00:00<?, ?it/s]

  0%|          | 0/163 [00:00<?, ?it/s]

  0%|          | 0/163 [00:00<?, ?it/s]

  0%|          | 0/163 [00:00<?, ?it/s]

  0%|          | 0/163 [00:00<?, ?it/s]

  0%|          | 0/163 [00:00<?, ?it/s]

  0%|          | 0/163 [00:00<?, ?it/s]

  0%|          | 0/163 [00:00<?, ?it/s]

  0%|          | 0/163 [00:00<?, ?it/s]

  0%|          | 0/163 [00:00<?, ?it/s]

  0%|          | 0/163 [00:00<?, ?it/s]

  0%|          | 0/163 [00:00<?, ?it/s]

  0%|          | 0/163 [00:00<?, ?it/s]

  0%|          | 0/163 [00:00<?, ?it/s]

  0%|          | 0/163 [00:00<?, ?it/s]

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 10       |
|    ep_rew_mean     | 0        |
| time/              |          |
|    episodes        | 2        |
|    fps             | 0        |
|    time_elapsed    | 308      |
|    total_timesteps | 20       |
| train/             |          |
|    actor_loss      | -16.5    |
|    critic_loss     | 1.6      |
|    ent_coef        | 0.759    |
|    ent_coef_loss   | -2.75    |
|    learning_rate   | 0.0003   |
|    n_updates       | 918      |
---------------------------------


  0%|          | 0/17 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/17 [00:00<?, ?it/s]

  0%|          | 0/151 [00:00<?, ?it/s]

  0%|          | 0/151 [00:00<?, ?it/s]

  0%|          | 0/151 [00:00<?, ?it/s]

  0%|          | 0/151 [00:00<?, ?it/s]

  0%|          | 0/151 [00:00<?, ?it/s]

  0%|          | 0/151 [00:00<?, ?it/s]

  0%|          | 0/151 [00:00<?, ?it/s]

  0%|          | 0/151 [00:00<?, ?it/s]

  0%|          | 0/151 [00:00<?, ?it/s]

  0%|          | 0/151 [00:00<?, ?it/s]

  0%|          | 0/151 [00:00<?, ?it/s]

  0%|          | 0/151 [00:00<?, ?it/s]

  0%|          | 0/151 [00:00<?, ?it/s]

  0%|          | 0/151 [00:00<?, ?it/s]

  0%|          | 0/151 [00:00<?, ?it/s]

  0%|          | 0/151 [00:00<?, ?it/s]

  0%|          | 0/151 [00:00<?, ?it/s]

  0%|          | 0/17 [00:00<?, ?it/s]

  0%|          | 0/153 [00:00<?, ?it/s]

  0%|          | 0/153 [00:00<?, ?it/s]

  0%|          | 0/153 [00:00<?, ?it/s]

  0%|          | 0/153 [00:00<?, ?it/s]

  0%|          | 0/153 [00:00<?, ?it/s]

  0%|          | 0/153 [00:00<?, ?it/s]

  0%|          | 0/153 [00:00<?, ?it/s]

  0%|          | 0/153 [00:00<?, ?it/s]

  0%|          | 0/153 [00:00<?, ?it/s]

  0%|          | 0/153 [00:00<?, ?it/s]

  0%|          | 0/153 [00:00<?, ?it/s]

  0%|          | 0/153 [00:00<?, ?it/s]

  0%|          | 0/153 [00:00<?, ?it/s]

  0%|          | 0/153 [00:00<?, ?it/s]

  0%|          | 0/153 [00:00<?, ?it/s]

  0%|          | 0/153 [00:00<?, ?it/s]

  0%|          | 0/153 [00:00<?, ?it/s]

  0%|          | 0/17 [00:00<?, ?it/s]

  0%|          | 0/154 [00:00<?, ?it/s]

  0%|          | 0/154 [00:00<?, ?it/s]

  0%|          | 0/154 [00:00<?, ?it/s]

  0%|          | 0/154 [00:00<?, ?it/s]

  0%|          | 0/154 [00:00<?, ?it/s]

  0%|          | 0/154 [00:00<?, ?it/s]

  0%|          | 0/154 [00:00<?, ?it/s]

  0%|          | 0/154 [00:00<?, ?it/s]

  0%|          | 0/154 [00:00<?, ?it/s]

  0%|          | 0/154 [00:00<?, ?it/s]

  0%|          | 0/154 [00:00<?, ?it/s]

  0%|          | 0/154 [00:00<?, ?it/s]

  0%|          | 0/154 [00:00<?, ?it/s]

  0%|          | 0/154 [00:00<?, ?it/s]

  0%|          | 0/154 [00:00<?, ?it/s]

  0%|          | 0/154 [00:00<?, ?it/s]

  0%|          | 0/154 [00:00<?, ?it/s]

  0%|          | 0/17 [00:00<?, ?it/s]

  0%|          | 0/156 [00:00<?, ?it/s]

  0%|          | 0/156 [00:00<?, ?it/s]

  0%|          | 0/156 [00:00<?, ?it/s]

  0%|          | 0/156 [00:00<?, ?it/s]

  0%|          | 0/156 [00:00<?, ?it/s]

  0%|          | 0/156 [00:00<?, ?it/s]

  0%|          | 0/156 [00:00<?, ?it/s]

  0%|          | 0/156 [00:00<?, ?it/s]

  0%|          | 0/156 [00:00<?, ?it/s]

  0%|          | 0/156 [00:00<?, ?it/s]

  0%|          | 0/156 [00:00<?, ?it/s]

  0%|          | 0/156 [00:00<?, ?it/s]

  0%|          | 0/156 [00:00<?, ?it/s]

  0%|          | 0/156 [00:00<?, ?it/s]

  0%|          | 0/156 [00:00<?, ?it/s]

  0%|          | 0/156 [00:00<?, ?it/s]

  0%|          | 0/156 [00:00<?, ?it/s]

  0%|          | 0/17 [00:00<?, ?it/s]

  0%|          | 0/157 [00:00<?, ?it/s]

  0%|          | 0/157 [00:00<?, ?it/s]

  0%|          | 0/157 [00:00<?, ?it/s]

  0%|          | 0/157 [00:00<?, ?it/s]

  0%|          | 0/157 [00:00<?, ?it/s]

  0%|          | 0/157 [00:00<?, ?it/s]

  0%|          | 0/157 [00:00<?, ?it/s]

  0%|          | 0/157 [00:00<?, ?it/s]

  0%|          | 0/157 [00:00<?, ?it/s]

  0%|          | 0/157 [00:00<?, ?it/s]

  0%|          | 0/157 [00:00<?, ?it/s]

  0%|          | 0/157 [00:00<?, ?it/s]

  0%|          | 0/157 [00:00<?, ?it/s]

  0%|          | 0/157 [00:00<?, ?it/s]

  0%|          | 0/157 [00:00<?, ?it/s]

  0%|          | 0/157 [00:00<?, ?it/s]

  0%|          | 0/157 [00:00<?, ?it/s]

  0%|          | 0/17 [00:00<?, ?it/s]

  0%|          | 0/159 [00:00<?, ?it/s]

  0%|          | 0/159 [00:00<?, ?it/s]

  0%|          | 0/159 [00:00<?, ?it/s]

  0%|          | 0/159 [00:00<?, ?it/s]

  0%|          | 0/159 [00:00<?, ?it/s]

  0%|          | 0/159 [00:00<?, ?it/s]

  0%|          | 0/159 [00:00<?, ?it/s]

  0%|          | 0/159 [00:00<?, ?it/s]

  0%|          | 0/159 [00:00<?, ?it/s]

  0%|          | 0/159 [00:00<?, ?it/s]

  0%|          | 0/159 [00:00<?, ?it/s]

  0%|          | 0/159 [00:00<?, ?it/s]

  0%|          | 0/159 [00:00<?, ?it/s]

  0%|          | 0/159 [00:00<?, ?it/s]

  0%|          | 0/159 [00:00<?, ?it/s]

  0%|          | 0/159 [00:00<?, ?it/s]

  0%|          | 0/159 [00:00<?, ?it/s]

  0%|          | 0/17 [00:00<?, ?it/s]

  0%|          | 0/160 [00:00<?, ?it/s]

  0%|          | 0/160 [00:00<?, ?it/s]

  0%|          | 0/160 [00:00<?, ?it/s]

  0%|          | 0/160 [00:00<?, ?it/s]

  0%|          | 0/160 [00:00<?, ?it/s]

  0%|          | 0/160 [00:00<?, ?it/s]

  0%|          | 0/160 [00:00<?, ?it/s]

  0%|          | 0/160 [00:00<?, ?it/s]

  0%|          | 0/160 [00:00<?, ?it/s]

  0%|          | 0/160 [00:00<?, ?it/s]

  0%|          | 0/160 [00:00<?, ?it/s]

  0%|          | 0/160 [00:00<?, ?it/s]

  0%|          | 0/160 [00:00<?, ?it/s]

  0%|          | 0/160 [00:00<?, ?it/s]

  0%|          | 0/160 [00:00<?, ?it/s]

  0%|          | 0/160 [00:00<?, ?it/s]

  0%|          | 0/160 [00:00<?, ?it/s]

  0%|          | 0/17 [00:00<?, ?it/s]

  0%|          | 0/162 [00:00<?, ?it/s]

  0%|          | 0/162 [00:00<?, ?it/s]

  0%|          | 0/162 [00:00<?, ?it/s]

  0%|          | 0/162 [00:00<?, ?it/s]

  0%|          | 0/162 [00:00<?, ?it/s]

  0%|          | 0/162 [00:00<?, ?it/s]

  0%|          | 0/162 [00:00<?, ?it/s]

  0%|          | 0/162 [00:00<?, ?it/s]

  0%|          | 0/162 [00:00<?, ?it/s]

  0%|          | 0/162 [00:00<?, ?it/s]

  0%|          | 0/162 [00:00<?, ?it/s]

  0%|          | 0/162 [00:00<?, ?it/s]

  0%|          | 0/162 [00:00<?, ?it/s]

  0%|          | 0/162 [00:00<?, ?it/s]

  0%|          | 0/162 [00:00<?, ?it/s]

  0%|          | 0/162 [00:00<?, ?it/s]

  0%|          | 0/162 [00:00<?, ?it/s]

  0%|          | 0/17 [00:00<?, ?it/s]

  0%|          | 0/163 [00:00<?, ?it/s]

  0%|          | 0/163 [00:00<?, ?it/s]

  0%|          | 0/163 [00:00<?, ?it/s]

  0%|          | 0/163 [00:00<?, ?it/s]

  0%|          | 0/163 [00:00<?, ?it/s]

  0%|          | 0/163 [00:00<?, ?it/s]

  0%|          | 0/163 [00:00<?, ?it/s]

  0%|          | 0/163 [00:00<?, ?it/s]

  0%|          | 0/163 [00:00<?, ?it/s]

  0%|          | 0/163 [00:00<?, ?it/s]

  0%|          | 0/163 [00:00<?, ?it/s]

  0%|          | 0/163 [00:00<?, ?it/s]

  0%|          | 0/163 [00:00<?, ?it/s]

  0%|          | 0/163 [00:00<?, ?it/s]

  0%|          | 0/163 [00:00<?, ?it/s]

  0%|          | 0/163 [00:00<?, ?it/s]

  0%|          | 0/163 [00:00<?, ?it/s]

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 10       |
|    ep_rew_mean     | 0        |
| time/              |          |
|    episodes        | 3        |
|    fps             | 0        |
|    time_elapsed    | 462      |
|    total_timesteps | 30       |
| train/             |          |
|    actor_loss      | -16.6    |
|    critic_loss     | 0.828    |
|    ent_coef        | 0.757    |
|    ent_coef_loss   | -2.79    |
|    learning_rate   | 0.0003   |
|    n_updates       | 928      |
---------------------------------


  0%|          | 0/17 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/17 [00:00<?, ?it/s]

  0%|          | 0/151 [00:00<?, ?it/s]

  0%|          | 0/151 [00:00<?, ?it/s]

  0%|          | 0/151 [00:00<?, ?it/s]

  0%|          | 0/151 [00:00<?, ?it/s]

  0%|          | 0/151 [00:00<?, ?it/s]

  0%|          | 0/151 [00:00<?, ?it/s]

  0%|          | 0/151 [00:00<?, ?it/s]

  0%|          | 0/151 [00:00<?, ?it/s]

  0%|          | 0/151 [00:00<?, ?it/s]

  0%|          | 0/151 [00:00<?, ?it/s]

  0%|          | 0/151 [00:00<?, ?it/s]

  0%|          | 0/151 [00:00<?, ?it/s]

  0%|          | 0/151 [00:00<?, ?it/s]

  0%|          | 0/151 [00:00<?, ?it/s]

  0%|          | 0/151 [00:00<?, ?it/s]

  0%|          | 0/151 [00:00<?, ?it/s]

  0%|          | 0/151 [00:00<?, ?it/s]

  0%|          | 0/17 [00:00<?, ?it/s]

  0%|          | 0/153 [00:00<?, ?it/s]

  0%|          | 0/153 [00:00<?, ?it/s]

  0%|          | 0/153 [00:00<?, ?it/s]

  0%|          | 0/153 [00:00<?, ?it/s]

  0%|          | 0/153 [00:00<?, ?it/s]

  0%|          | 0/153 [00:00<?, ?it/s]

  0%|          | 0/153 [00:00<?, ?it/s]

  0%|          | 0/153 [00:00<?, ?it/s]

  0%|          | 0/153 [00:00<?, ?it/s]

  0%|          | 0/153 [00:00<?, ?it/s]

  0%|          | 0/153 [00:00<?, ?it/s]

  0%|          | 0/153 [00:00<?, ?it/s]

  0%|          | 0/153 [00:00<?, ?it/s]

  0%|          | 0/153 [00:00<?, ?it/s]

  0%|          | 0/153 [00:00<?, ?it/s]

  0%|          | 0/153 [00:00<?, ?it/s]

  0%|          | 0/153 [00:00<?, ?it/s]

  0%|          | 0/17 [00:00<?, ?it/s]

  0%|          | 0/154 [00:00<?, ?it/s]

  0%|          | 0/154 [00:00<?, ?it/s]

  0%|          | 0/154 [00:00<?, ?it/s]

  0%|          | 0/154 [00:00<?, ?it/s]

  0%|          | 0/154 [00:00<?, ?it/s]

  0%|          | 0/154 [00:00<?, ?it/s]

  0%|          | 0/154 [00:00<?, ?it/s]

  0%|          | 0/154 [00:00<?, ?it/s]

  0%|          | 0/154 [00:00<?, ?it/s]

  0%|          | 0/154 [00:00<?, ?it/s]

  0%|          | 0/154 [00:00<?, ?it/s]

  0%|          | 0/154 [00:00<?, ?it/s]

  0%|          | 0/154 [00:00<?, ?it/s]

  0%|          | 0/154 [00:00<?, ?it/s]

  0%|          | 0/154 [00:00<?, ?it/s]

  0%|          | 0/154 [00:00<?, ?it/s]

  0%|          | 0/154 [00:00<?, ?it/s]

  0%|          | 0/17 [00:00<?, ?it/s]

  0%|          | 0/156 [00:00<?, ?it/s]

  0%|          | 0/156 [00:00<?, ?it/s]

  0%|          | 0/156 [00:00<?, ?it/s]

  0%|          | 0/156 [00:00<?, ?it/s]

  0%|          | 0/156 [00:00<?, ?it/s]

  0%|          | 0/156 [00:00<?, ?it/s]

  0%|          | 0/156 [00:00<?, ?it/s]

  0%|          | 0/156 [00:00<?, ?it/s]

  0%|          | 0/156 [00:00<?, ?it/s]

  0%|          | 0/156 [00:00<?, ?it/s]

  0%|          | 0/156 [00:00<?, ?it/s]

  0%|          | 0/156 [00:00<?, ?it/s]

  0%|          | 0/156 [00:00<?, ?it/s]

  0%|          | 0/156 [00:00<?, ?it/s]

  0%|          | 0/156 [00:00<?, ?it/s]

  0%|          | 0/156 [00:00<?, ?it/s]

  0%|          | 0/156 [00:00<?, ?it/s]

  0%|          | 0/17 [00:00<?, ?it/s]

  0%|          | 0/157 [00:00<?, ?it/s]

  0%|          | 0/157 [00:00<?, ?it/s]

  0%|          | 0/157 [00:00<?, ?it/s]

  0%|          | 0/157 [00:00<?, ?it/s]

  0%|          | 0/157 [00:00<?, ?it/s]

  0%|          | 0/157 [00:00<?, ?it/s]

  0%|          | 0/157 [00:00<?, ?it/s]

  0%|          | 0/157 [00:00<?, ?it/s]

  0%|          | 0/157 [00:00<?, ?it/s]

  0%|          | 0/157 [00:00<?, ?it/s]

  0%|          | 0/157 [00:00<?, ?it/s]

  0%|          | 0/157 [00:00<?, ?it/s]

  0%|          | 0/157 [00:00<?, ?it/s]

  0%|          | 0/157 [00:00<?, ?it/s]

  0%|          | 0/157 [00:00<?, ?it/s]

  0%|          | 0/157 [00:00<?, ?it/s]

  0%|          | 0/157 [00:00<?, ?it/s]

  0%|          | 0/17 [00:00<?, ?it/s]

  0%|          | 0/159 [00:00<?, ?it/s]

  0%|          | 0/159 [00:00<?, ?it/s]

  0%|          | 0/159 [00:00<?, ?it/s]

  0%|          | 0/159 [00:00<?, ?it/s]

  0%|          | 0/159 [00:00<?, ?it/s]

  0%|          | 0/159 [00:00<?, ?it/s]

  0%|          | 0/159 [00:00<?, ?it/s]

  0%|          | 0/159 [00:00<?, ?it/s]

  0%|          | 0/159 [00:00<?, ?it/s]

  0%|          | 0/159 [00:00<?, ?it/s]

  0%|          | 0/159 [00:00<?, ?it/s]

  0%|          | 0/159 [00:00<?, ?it/s]

  0%|          | 0/159 [00:00<?, ?it/s]

  0%|          | 0/159 [00:00<?, ?it/s]

  0%|          | 0/159 [00:00<?, ?it/s]

  0%|          | 0/159 [00:00<?, ?it/s]

  0%|          | 0/159 [00:00<?, ?it/s]

  0%|          | 0/17 [00:00<?, ?it/s]

  0%|          | 0/160 [00:00<?, ?it/s]

  0%|          | 0/160 [00:00<?, ?it/s]

  0%|          | 0/160 [00:00<?, ?it/s]

  0%|          | 0/160 [00:00<?, ?it/s]

  0%|          | 0/160 [00:00<?, ?it/s]

  0%|          | 0/160 [00:00<?, ?it/s]

  0%|          | 0/160 [00:00<?, ?it/s]

  0%|          | 0/160 [00:00<?, ?it/s]

  0%|          | 0/160 [00:00<?, ?it/s]

  0%|          | 0/160 [00:00<?, ?it/s]

  0%|          | 0/160 [00:00<?, ?it/s]

  0%|          | 0/160 [00:00<?, ?it/s]

  0%|          | 0/160 [00:00<?, ?it/s]

  0%|          | 0/160 [00:00<?, ?it/s]

  0%|          | 0/160 [00:00<?, ?it/s]

  0%|          | 0/160 [00:00<?, ?it/s]

  0%|          | 0/160 [00:00<?, ?it/s]

  0%|          | 0/17 [00:00<?, ?it/s]

  0%|          | 0/162 [00:00<?, ?it/s]

  0%|          | 0/162 [00:00<?, ?it/s]

  0%|          | 0/162 [00:00<?, ?it/s]

  0%|          | 0/162 [00:00<?, ?it/s]

  0%|          | 0/162 [00:00<?, ?it/s]

  0%|          | 0/162 [00:00<?, ?it/s]

  0%|          | 0/162 [00:00<?, ?it/s]

  0%|          | 0/162 [00:00<?, ?it/s]

  0%|          | 0/162 [00:00<?, ?it/s]

  0%|          | 0/162 [00:00<?, ?it/s]

  0%|          | 0/162 [00:00<?, ?it/s]

  0%|          | 0/162 [00:00<?, ?it/s]

  0%|          | 0/162 [00:00<?, ?it/s]

  0%|          | 0/162 [00:00<?, ?it/s]

  0%|          | 0/162 [00:00<?, ?it/s]

  0%|          | 0/162 [00:00<?, ?it/s]

  0%|          | 0/162 [00:00<?, ?it/s]

  0%|          | 0/17 [00:00<?, ?it/s]

  0%|          | 0/163 [00:00<?, ?it/s]

  0%|          | 0/163 [00:00<?, ?it/s]

  0%|          | 0/163 [00:00<?, ?it/s]

  0%|          | 0/163 [00:00<?, ?it/s]

  0%|          | 0/163 [00:00<?, ?it/s]

  0%|          | 0/163 [00:00<?, ?it/s]

  0%|          | 0/163 [00:00<?, ?it/s]

  0%|          | 0/163 [00:00<?, ?it/s]

  0%|          | 0/163 [00:00<?, ?it/s]

  0%|          | 0/163 [00:00<?, ?it/s]

  0%|          | 0/163 [00:00<?, ?it/s]

  0%|          | 0/163 [00:00<?, ?it/s]

  0%|          | 0/163 [00:00<?, ?it/s]

  0%|          | 0/163 [00:00<?, ?it/s]

  0%|          | 0/163 [00:00<?, ?it/s]

  0%|          | 0/163 [00:00<?, ?it/s]

  0%|          | 0/163 [00:00<?, ?it/s]

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 10       |
|    ep_rew_mean     | 0        |
| time/              |          |
|    episodes        | 4        |
|    fps             | 0        |
|    time_elapsed    | 616      |
|    total_timesteps | 40       |
| train/             |          |
|    actor_loss      | -16.7    |
|    critic_loss     | 0.79     |
|    ent_coef        | 0.755    |
|    ent_coef_loss   | -2.81    |
|    learning_rate   | 0.0003   |
|    n_updates       | 938      |
---------------------------------


  0%|          | 0/17 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/150 [00:00<?, ?it/s]

  0%|          | 0/17 [00:00<?, ?it/s]

  0%|          | 0/151 [00:00<?, ?it/s]

  0%|          | 0/151 [00:00<?, ?it/s]

  0%|          | 0/151 [00:00<?, ?it/s]

  0%|          | 0/151 [00:00<?, ?it/s]

  0%|          | 0/151 [00:00<?, ?it/s]

  0%|          | 0/151 [00:00<?, ?it/s]

  0%|          | 0/151 [00:00<?, ?it/s]

  0%|          | 0/151 [00:00<?, ?it/s]

  0%|          | 0/151 [00:00<?, ?it/s]

  0%|          | 0/151 [00:00<?, ?it/s]

  0%|          | 0/151 [00:00<?, ?it/s]

  0%|          | 0/151 [00:00<?, ?it/s]

  0%|          | 0/151 [00:00<?, ?it/s]

  0%|          | 0/151 [00:00<?, ?it/s]

  0%|          | 0/151 [00:00<?, ?it/s]

  0%|          | 0/151 [00:00<?, ?it/s]

  0%|          | 0/151 [00:00<?, ?it/s]

  0%|          | 0/17 [00:00<?, ?it/s]

In [None]:
%lprun -f main_loop main_loop()

  logger.warn(


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



Using cuda device


  0%|          | 0/1000 [00:00<?, ?it/s]

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1e+03    |
|    ep_rew_mean     | -255     |
| time/              |          |
|    episodes        | 1        |
|    fps             | 74       |
|    time_elapsed    | 13       |
|    total_timesteps | 1000     |
| train/             |          |
|    actor_loss      | -16.8    |
|    critic_loss     | 0.547    |
|    ent_coef        | 0.764    |
|    ent_coef_loss   | -2.69    |
|    learning_rate   | 0.0003   |
|    n_updates       | 899      |
---------------------------------
pos in replay buffer: 1000


  to_add_rew = [float(sac.replay_buffer.rewards[i].flatten()) for i in range(sac.replay_buffer.pos-1000, sac.replay_buffer.pos)]
  to_add_don = [int(sac.replay_buffer.dones[i].flatten()) for i in range(sac.replay_buffer.pos-1000, sac.replay_buffer.pos)]
  logger.warn(


  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/17 [00:00<?, ?it/s]

torch.Size([1, 502, 32000]) <class 'torch.Tensor'>
probs: torch.Size([1, 501, 10]), type: <class 'torch.Tensor'>
prob_slice: (4, 10), type: <class 'numpy.ndarray'>
torch.Size([1, 502, 32000]) <class 'torch.Tensor'>
probs: torch.Size([1, 501, 10]), type: <class 'torch.Tensor'>
prob_slice: (4, 10), type: <class 'numpy.ndarray'>
torch.Size([1, 502, 32000]) <class 'torch.Tensor'>
probs: torch.Size([1, 501, 10]), type: <class 'torch.Tensor'>
prob_slice: (4, 10), type: <class 'numpy.ndarray'>


  log_ratio = np.log(self.bin_height_arr) - np.log(Multi_PDF.bin_height_arr)
  weighted_log_ratio = log_ratio * self.bin_height_arr * self.bin_width_arr


torch.Size([1, 502, 32000]) <class 'torch.Tensor'>
probs: torch.Size([1, 501, 10]), type: <class 'torch.Tensor'>
prob_slice: (4, 10), type: <class 'numpy.ndarray'>
torch.Size([1, 502, 32000]) <class 'torch.Tensor'>
probs: torch.Size([1, 501, 10]), type: <class 'torch.Tensor'>
prob_slice: (4, 10), type: <class 'numpy.ndarray'>
torch.Size([1, 502, 32000]) <class 'torch.Tensor'>
probs: torch.Size([1, 501, 10]), type: <class 'torch.Tensor'>
prob_slice: (4, 10), type: <class 'numpy.ndarray'>
torch.Size([1, 502, 32000]) <class 'torch.Tensor'>
probs: torch.Size([1, 501, 10]), type: <class 'torch.Tensor'>
prob_slice: (4, 10), type: <class 'numpy.ndarray'>
torch.Size([1, 502, 32000]) <class 'torch.Tensor'>
probs: torch.Size([1, 501, 10]), type: <class 'torch.Tensor'>
prob_slice: (4, 10), type: <class 'numpy.ndarray'>
torch.Size([1, 502, 32000]) <class 'torch.Tensor'>
probs: torch.Size([1, 501, 10]), type: <class 'torch.Tensor'>
prob_slice: (4, 10), type: <class 'numpy.ndarray'>
torch.Size([1, 5

  0%|          | 0/17 [00:00<?, ?it/s]

torch.Size([1, 507, 32000]) <class 'torch.Tensor'>
probs: torch.Size([1, 506, 10]), type: <class 'torch.Tensor'>
prob_slice: (4, 10), type: <class 'numpy.ndarray'>
torch.Size([1, 507, 32000]) <class 'torch.Tensor'>
probs: torch.Size([1, 506, 10]), type: <class 'torch.Tensor'>
prob_slice: (4, 10), type: <class 'numpy.ndarray'>
torch.Size([1, 507, 32000]) <class 'torch.Tensor'>
probs: torch.Size([1, 506, 10]), type: <class 'torch.Tensor'>
prob_slice: (4, 10), type: <class 'numpy.ndarray'>
torch.Size([1, 507, 32000]) <class 'torch.Tensor'>
probs: torch.Size([1, 506, 10]), type: <class 'torch.Tensor'>
prob_slice: (4, 10), type: <class 'numpy.ndarray'>
torch.Size([1, 507, 32000]) <class 'torch.Tensor'>
probs: torch.Size([1, 506, 10]), type: <class 'torch.Tensor'>
prob_slice: (4, 10), type: <class 'numpy.ndarray'>
torch.Size([1, 507, 32000]) <class 'torch.Tensor'>
probs: torch.Size([1, 506, 10]), type: <class 'torch.Tensor'>
prob_slice: (4, 10), type: <class 'numpy.ndarray'>
torch.Size([1, 5

  0%|          | 0/17 [00:00<?, ?it/s]

torch.Size([1, 512, 32000]) <class 'torch.Tensor'>
probs: torch.Size([1, 511, 10]), type: <class 'torch.Tensor'>
prob_slice: (4, 10), type: <class 'numpy.ndarray'>
torch.Size([1, 512, 32000]) <class 'torch.Tensor'>
probs: torch.Size([1, 511, 10]), type: <class 'torch.Tensor'>
prob_slice: (4, 10), type: <class 'numpy.ndarray'>
torch.Size([1, 512, 32000]) <class 'torch.Tensor'>
probs: torch.Size([1, 511, 10]), type: <class 'torch.Tensor'>
prob_slice: (4, 10), type: <class 'numpy.ndarray'>
torch.Size([1, 512, 32000]) <class 'torch.Tensor'>
probs: torch.Size([1, 511, 10]), type: <class 'torch.Tensor'>
prob_slice: (4, 10), type: <class 'numpy.ndarray'>
torch.Size([1, 512, 32000]) <class 'torch.Tensor'>
probs: torch.Size([1, 511, 10]), type: <class 'torch.Tensor'>
prob_slice: (4, 10), type: <class 'numpy.ndarray'>
torch.Size([1, 512, 32000]) <class 'torch.Tensor'>
probs: torch.Size([1, 511, 10]), type: <class 'torch.Tensor'>
prob_slice: (4, 10), type: <class 'numpy.ndarray'>
torch.Size([1, 5

  0%|          | 0/17 [00:00<?, ?it/s]

torch.Size([1, 517, 32000]) <class 'torch.Tensor'>
probs: torch.Size([1, 516, 10]), type: <class 'torch.Tensor'>
prob_slice: (4, 10), type: <class 'numpy.ndarray'>
torch.Size([1, 517, 32000]) <class 'torch.Tensor'>
probs: torch.Size([1, 516, 10]), type: <class 'torch.Tensor'>
prob_slice: (4, 10), type: <class 'numpy.ndarray'>
torch.Size([1, 517, 32000]) <class 'torch.Tensor'>
probs: torch.Size([1, 516, 10]), type: <class 'torch.Tensor'>
prob_slice: (4, 10), type: <class 'numpy.ndarray'>
torch.Size([1, 517, 32000]) <class 'torch.Tensor'>
probs: torch.Size([1, 516, 10]), type: <class 'torch.Tensor'>
prob_slice: (4, 10), type: <class 'numpy.ndarray'>
torch.Size([1, 517, 32000]) <class 'torch.Tensor'>
probs: torch.Size([1, 516, 10]), type: <class 'torch.Tensor'>
prob_slice: (4, 10), type: <class 'numpy.ndarray'>
torch.Size([1, 517, 32000]) <class 'torch.Tensor'>
probs: torch.Size([1, 516, 10]), type: <class 'torch.Tensor'>
prob_slice: (4, 10), type: <class 'numpy.ndarray'>
torch.Size([1, 5

  0%|          | 0/17 [00:00<?, ?it/s]

torch.Size([1, 522, 32000]) <class 'torch.Tensor'>
probs: torch.Size([1, 521, 10]), type: <class 'torch.Tensor'>
prob_slice: (4, 10), type: <class 'numpy.ndarray'>
torch.Size([1, 522, 32000]) <class 'torch.Tensor'>
probs: torch.Size([1, 521, 10]), type: <class 'torch.Tensor'>
prob_slice: (4, 10), type: <class 'numpy.ndarray'>
torch.Size([1, 522, 32000]) <class 'torch.Tensor'>
probs: torch.Size([1, 521, 10]), type: <class 'torch.Tensor'>
prob_slice: (4, 10), type: <class 'numpy.ndarray'>
torch.Size([1, 522, 32000]) <class 'torch.Tensor'>
probs: torch.Size([1, 521, 10]), type: <class 'torch.Tensor'>
prob_slice: (4, 10), type: <class 'numpy.ndarray'>
torch.Size([1, 522, 32000]) <class 'torch.Tensor'>
probs: torch.Size([1, 521, 10]), type: <class 'torch.Tensor'>
prob_slice: (4, 10), type: <class 'numpy.ndarray'>
torch.Size([1, 522, 32000]) <class 'torch.Tensor'>
probs: torch.Size([1, 521, 10]), type: <class 'torch.Tensor'>
prob_slice: (4, 10), type: <class 'numpy.ndarray'>
torch.Size([1, 5

  0%|          | 0/17 [00:00<?, ?it/s]

torch.Size([1, 527, 32000]) <class 'torch.Tensor'>
probs: torch.Size([1, 526, 10]), type: <class 'torch.Tensor'>
prob_slice: (4, 10), type: <class 'numpy.ndarray'>
torch.Size([1, 527, 32000]) <class 'torch.Tensor'>
probs: torch.Size([1, 526, 10]), type: <class 'torch.Tensor'>
prob_slice: (4, 10), type: <class 'numpy.ndarray'>
torch.Size([1, 527, 32000]) <class 'torch.Tensor'>
probs: torch.Size([1, 526, 10]), type: <class 'torch.Tensor'>
prob_slice: (4, 10), type: <class 'numpy.ndarray'>
torch.Size([1, 527, 32000]) <class 'torch.Tensor'>
probs: torch.Size([1, 526, 10]), type: <class 'torch.Tensor'>
prob_slice: (4, 10), type: <class 'numpy.ndarray'>
torch.Size([1, 527, 32000]) <class 'torch.Tensor'>
probs: torch.Size([1, 526, 10]), type: <class 'torch.Tensor'>
prob_slice: (4, 10), type: <class 'numpy.ndarray'>
torch.Size([1, 527, 32000]) <class 'torch.Tensor'>
probs: torch.Size([1, 526, 10]), type: <class 'torch.Tensor'>
prob_slice: (4, 10), type: <class 'numpy.ndarray'>
torch.Size([1, 5

  0%|          | 0/17 [00:00<?, ?it/s]

torch.Size([1, 532, 32000]) <class 'torch.Tensor'>
probs: torch.Size([1, 531, 10]), type: <class 'torch.Tensor'>
prob_slice: (4, 10), type: <class 'numpy.ndarray'>
torch.Size([1, 532, 32000]) <class 'torch.Tensor'>
probs: torch.Size([1, 531, 10]), type: <class 'torch.Tensor'>
prob_slice: (4, 10), type: <class 'numpy.ndarray'>
torch.Size([1, 532, 32000]) <class 'torch.Tensor'>
probs: torch.Size([1, 531, 10]), type: <class 'torch.Tensor'>
prob_slice: (4, 10), type: <class 'numpy.ndarray'>
torch.Size([1, 532, 32000]) <class 'torch.Tensor'>
probs: torch.Size([1, 531, 10]), type: <class 'torch.Tensor'>
prob_slice: (4, 10), type: <class 'numpy.ndarray'>
torch.Size([1, 532, 32000]) <class 'torch.Tensor'>
probs: torch.Size([1, 531, 10]), type: <class 'torch.Tensor'>
prob_slice: (4, 10), type: <class 'numpy.ndarray'>
torch.Size([1, 532, 32000]) <class 'torch.Tensor'>
probs: torch.Size([1, 531, 10]), type: <class 'torch.Tensor'>
prob_slice: (4, 10), type: <class 'numpy.ndarray'>
torch.Size([1, 5

  0%|          | 0/17 [00:00<?, ?it/s]

torch.Size([1, 537, 32000]) <class 'torch.Tensor'>
probs: torch.Size([1, 536, 10]), type: <class 'torch.Tensor'>
prob_slice: (4, 10), type: <class 'numpy.ndarray'>
torch.Size([1, 537, 32000]) <class 'torch.Tensor'>
probs: torch.Size([1, 536, 10]), type: <class 'torch.Tensor'>
prob_slice: (4, 10), type: <class 'numpy.ndarray'>
torch.Size([1, 537, 32000]) <class 'torch.Tensor'>
probs: torch.Size([1, 536, 10]), type: <class 'torch.Tensor'>
prob_slice: (4, 10), type: <class 'numpy.ndarray'>
torch.Size([1, 537, 32000]) <class 'torch.Tensor'>
probs: torch.Size([1, 536, 10]), type: <class 'torch.Tensor'>
prob_slice: (4, 10), type: <class 'numpy.ndarray'>
torch.Size([1, 537, 32000]) <class 'torch.Tensor'>
probs: torch.Size([1, 536, 10]), type: <class 'torch.Tensor'>
prob_slice: (4, 10), type: <class 'numpy.ndarray'>
torch.Size([1, 537, 32000]) <class 'torch.Tensor'>
probs: torch.Size([1, 536, 10]), type: <class 'torch.Tensor'>
prob_slice: (4, 10), type: <class 'numpy.ndarray'>
torch.Size([1, 5

  0%|          | 0/17 [00:00<?, ?it/s]

torch.Size([1, 542, 32000]) <class 'torch.Tensor'>
probs: torch.Size([1, 541, 10]), type: <class 'torch.Tensor'>
prob_slice: (4, 10), type: <class 'numpy.ndarray'>
torch.Size([1, 542, 32000]) <class 'torch.Tensor'>
probs: torch.Size([1, 541, 10]), type: <class 'torch.Tensor'>
prob_slice: (4, 10), type: <class 'numpy.ndarray'>
torch.Size([1, 542, 32000]) <class 'torch.Tensor'>
probs: torch.Size([1, 541, 10]), type: <class 'torch.Tensor'>
prob_slice: (4, 10), type: <class 'numpy.ndarray'>
torch.Size([1, 542, 32000]) <class 'torch.Tensor'>
probs: torch.Size([1, 541, 10]), type: <class 'torch.Tensor'>
prob_slice: (4, 10), type: <class 'numpy.ndarray'>
torch.Size([1, 542, 32000]) <class 'torch.Tensor'>
probs: torch.Size([1, 541, 10]), type: <class 'torch.Tensor'>
prob_slice: (4, 10), type: <class 'numpy.ndarray'>
torch.Size([1, 542, 32000]) <class 'torch.Tensor'>
probs: torch.Size([1, 541, 10]), type: <class 'torch.Tensor'>
prob_slice: (4, 10), type: <class 'numpy.ndarray'>
torch.Size([1, 5