In [1]:
import torch
import gym
import torch
import gym
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
import matplotlib.pyplot as plt
import numpy as np
from tqdm import tqdm
from torch.distributions import Normal
import scipy.signal
eps = np.finfo(np.float32).eps.item()
#torch.manual_seed(100)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device = "cpu"

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def discount_cumsum(x, discount):
    """
    magic from rllab for computing discounted cumulative sums of vectors.
    input: 
        vector x, 
        [x0, 
         x1, 
         x2]
    output:
        [x0 + discount * x1 + discount^2 * x2,  
         x1 + discount * x2,
         x2]
    """
    return scipy.signal.lfilter([1], [1, float(-discount)], x[::-1], axis=0)[::-1]

In [3]:
class ValueNetwork(nn.Module):
    def __init__(self, obs_dimension, sizes, act = nn.ReLU):
        super(ValueNetwork, self).__init__()
        sizes = [obs_dimension] + sizes + [1]
        out_activation = nn.Identity
        self.layers = []
        for j in range(0,len(sizes) - 1):
            act_l = act if j < len(sizes) -2 else out_activation
            self.layers+=[nn.Linear(sizes[j], sizes[j+1]), act_l()]
        self.v = nn.Sequential(*self.layers)
    def forward(self, x):
        return self.v(x)

In [4]:
class PolicyNetworkCat(nn.Module):
    def __init__(self, obs_dimension, sizes, action_dimension, act= nn.ReLU):
        super(PolicyNetworkCat, self).__init__()
        sizes = [obs_dimension] + sizes + [action_dimension]
        out_activation = nn.Identity
        self.layers = []
        for j in range(0,len(sizes) - 1):
            act_l = act if j < len(sizes) -2 else out_activation
            self.layers+=[nn.Linear(sizes[j], sizes[j+1]), act_l()]
        self.pi = nn.Sequential(*self.layers)
    def forward(self, x):
        score = self.pi(x)
        #probs = F.softmax(score,dim = 1)
        dist = torch.distributions.Categorical(logits=score)
        return dist

In [5]:
class PolicyNetworkGauss(nn.Module):
    def __init__(self, obs_dimension, sizes, action_dimension):
        super(PolicyNetworkGauss, self).__init__()
        sizes = [obs_dimension] + sizes + [action_dimension]
        act = nn.Tanh
        out_activation = nn.Identity
        self.layers = []
        for j in range(0,len(sizes) - 1):
            act_l = act if j < len(sizes) -2 else out_activation
            self.layers+=[nn.Linear(sizes[j], sizes[j+1]), act_l()]
        self.mu = nn.Sequential(*self.layers)
        log_std = -0.5*np.ones(action_dimension, dtype=np.float32)
        self.log_std = torch.nn.Parameter(torch.as_tensor(log_std))
    def forward(self, x):
        mean = self.mu(x)
        std = torch.exp(self.log_std)
        dist = Normal(mean, std)
        return dist

In [6]:
env = gym.make('CartPole-v1')
sizes = [128]
obs_dimension = env.observation_space.shape
action_dimension = env.action_space.shape
print(obs_dimension)
print(env.action_space.n)
v = ValueNetwork(*obs_dimension, sizes)
pi = PolicyNetworkGauss(*obs_dimension, sizes, 2)

(4,)
2


In [7]:
class PPOBuffer:
    """
    A buffer for storing trajectories experienced by a PPO agent interacting
    with the environment, and using Generalized Advantage Estimation (GAE-Lambda)
    for calculating the advantages of state-action pairs.
    """

    def __init__(self, obs_dim, act_dim, size, gamma=0.99, lam=0.95):
        self.obs_buf = np.zeros((size, obs_dim), dtype=np.float32)
        #self.act_buf = np.zeros((size,), dtype=np.float32)
        self.act_buf = np.zeros((size,act_dim), dtype=np.float32)
        self.adv_buf = np.zeros(size, dtype=np.float32)
        self.rew_buf = np.zeros(size, dtype=np.float32)
        self.ret_buf = np.zeros(size, dtype=np.float32)
        self.val_buf = np.zeros(size, dtype=np.float32)
        self.logp_buf = np.zeros(size, dtype=np.float32)
        self.gamma, self.lam = gamma, lam
        self.ptr, self.path_start_idx, self.max_size = 0, 0, size

    def store(self, obs, act, rew, val, logp):
        """
        Append one timestep of agent-environment interaction to the buffer.
        """
        assert self.ptr < self.max_size     # buffer has to have room so you can store
        self.obs_buf[self.ptr] = obs
        self.act_buf[self.ptr] = act
        self.rew_buf[self.ptr] = rew
        self.val_buf[self.ptr] = val
        self.logp_buf[self.ptr] = logp
        self.ptr += 1

    def finish_path(self, last_val=0):
        """
        Call this at the end of a trajectory, or when one gets cut off
        by an epoch ending. This looks back in the buffer to where the
        trajectory started, and uses rewards and value estimates from
        the whole trajectory to compute advantage estimates with GAE-Lambda,
        as well as compute the rewards-to-go for each state, to use as
        the targets for the value function.

        The "last_val" argument should be 0 if the trajectory ended
        because the agent reached a terminal state (died), and otherwise
        should be V(s_T), the value function estimated for the last state.
        This allows us to bootstrap the reward-to-go calculation to account
        for timesteps beyond the arbitrary episode horizon (or epoch cutoff).
        """

        path_slice = slice(self.path_start_idx, self.ptr)
        rews = np.append(self.rew_buf[path_slice], last_val)
        vals = np.append(self.val_buf[path_slice], last_val)
       # print(rews)
       # print(vals)
        # the next two lines implement GAE-Lambda advantage calculation
        deltas = rews[:-1] + self.gamma * vals[1:] - vals[:-1]
        self.adv_buf[path_slice] = discount_cumsum(deltas, self.gamma * self.lam)
        
        # the next line computes rewards-to-go, to be targets for the value function
        self.ret_buf[path_slice] = discount_cumsum(rews, self.gamma)[:-1]
        
        self.path_start_idx = self.ptr

    def get(self):
        """
        Call this at the end of an epoch to get all of the data from
        the buffer, with advantages appropriately normalized (shifted to have
        mean zero and std one). Also, resets some pointers in the buffer.
        """
        assert self.ptr == self.max_size    # buffer has to be full before you can get
        self.ptr, self.path_start_idx = 0, 0
        # the next two lines implement the advantage normalization trick

        self.adv_buf = (self.adv_buf - self.adv_buf.mean()) / self.adv_buf.std()
        data = dict(obs=self.obs_buf, act=self.act_buf, ret=self.ret_buf,
                    adv=self.adv_buf, logp=self.logp_buf)
        return {k: torch.as_tensor(v, dtype=torch.float32) for k,v in data.items()}


In [8]:
def ppo(env, seed = 0, buff_size = 4000, train_time_steps = 1000000, gamma = 0.99, clip_ratio = 0.2, lr_pi = 1e-3, 
        lr_vf = 1e-3, pi_train_itrs = 80, v_train_itrs = 80, lam = 0.97, max_ep_len = 500):
        obs_dim = env.observation_space.shape
        #action_dim = 2
        action_dim = env.action_space.shape
        h_sizes = [64,64]
        vi = ValueNetwork(*obs_dim, h_sizes).to(device)
        #pi = PolicyNetworkCat(*obs_dim, h_sizes, action_dim).to(device)
        pi = PolicyNetworkGauss(*obs_dim, h_sizes, action_dim).to(device)
        data_buff = PPOBuffer(*obs_dim, action_dim, buff_size)
        policy_opt = optim.Adam(pi.parameters(), lr = lr_pi)
        value_opt = optim.Adam(vi.parameters(), lr = lr_vf)
        obs = env.reset()
        curr_time_step = 0
        pbar = tqdm(total = train_time_steps)
        num_episode = 0
        ep_rewards = [0]
        while curr_time_step < train_time_steps: 
                for t in range(0, buff_size):
                        with torch.no_grad():
                                m = pi(torch.as_tensor(obs, dtype=torch.float32).to(device))
                                action = m.sample()
                                action = action.cpu().numpy() 
                                clipped_action = np.clip(env.action_space.low, env.action_space.high)
                                logp = m.log_prob(a)
                                #obs_new, rew, done, _ = env.step(a.item())
                                obs_new, rew, done, _ = env.step(clipped_action)
                                ep_rewards[num_episode]+=rew 
                                v = vi(torch.as_tensor(obs, dtype=torch.float32).to(device))
                        data_buff.store(obs, action, rew, v.cpu().numpy(), logp.cpu().numpy())
                        obs = obs_new
                        if done or t == buff_size-1:
                                if done:
                                        v_ = 0.
                                        obs = env.reset()
                                        done = False
                                        num_episode+=1
                                        ep_rewards.append(0)
                                        if num_episode %100 == 0:
                                                print(f'episode: {num_episode-1} \t episode_reward: {np.mean(ep_rewards[-10:-2])} \t total steps:{curr_time_step}')
                                else:
                                        v_ = vi(torch.as_tensor(obs, dtype=torch.float32).to(device))
                                        v_ = v_.detach().cpu().numpy()
                                data_buff.finish_path(v_)
                        curr_time_step+=1
                        pbar.update(1)
                data = data_buff.get()
                ret, act, adv, o, logp_old= data['ret'].to(device), data['act'].to(device), data['adv'].to(device), data['obs'].to(device), data['logp'].to(device)
                for j in range(0, pi_train_itrs):
                        act_dist = pi(o)
                        logp = act_dist.log_prob(act)
                        ratio = torch.exp(logp - logp_old)
                        clip_adv = torch.clamp(ratio, 1-clip_ratio, 1+clip_ratio) * adv
                        loss_pi = -(torch.min(ratio * adv, clip_adv)).mean()
                        loss_pi.backward()
                        policy_opt.step()
                for i in range (0, v_train_itrs):
                        value_opt.zero_grad()
                       # ret, ob = data['ret'], data['obs']
                        val = vi(o)
                        value_loss = F.mse_loss(val, ret)
                        value_loss.backward()
                        value_opt.step()
                #pbar.update(1)
        pbar.close()



In [9]:
#python.dataScience.textOutputLimit = 0
ppo(env)


  0%|          | 2776/1000000 [00:00<04:07, 4021.94it/s]

episode: 99 	 episode_reward: 17.375 	 total steps:2016


  value_loss = F.mse_loss(val, ret)
  0%|          | 4805/1000000 [00:01<04:23, 3781.04it/s]

episode: 199 	 episode_reward: 31.625 	 total steps:4175


  1%|          | 6900/1000000 [00:01<04:01, 4110.68it/s]

episode: 299 	 episode_reward: 16.875 	 total steps:6308


  1%|          | 8967/1000000 [00:02<04:10, 3960.31it/s]

episode: 399 	 episode_reward: 17.75 	 total steps:8478


  1%|          | 11463/1000000 [00:02<03:58, 4138.43it/s]

episode: 499 	 episode_reward: 21.625 	 total steps:10712


  1%|▏         | 13544/1000000 [00:03<04:03, 4042.90it/s]

episode: 599 	 episode_reward: 22.0 	 total steps:12857


  2%|▏         | 15600/1000000 [00:03<04:00, 4090.08it/s]

episode: 699 	 episode_reward: 22.625 	 total steps:15063


  2%|▏         | 17698/1000000 [00:04<03:58, 4113.43it/s]

episode: 799 	 episode_reward: 21.0 	 total steps:17224


  2%|▏         | 19770/1000000 [00:04<03:59, 4087.34it/s]

episode: 899 	 episode_reward: 25.5 	 total steps:19390


  2%|▏         | 22219/1000000 [00:05<04:02, 4036.22it/s]

episode: 999 	 episode_reward: 28.25 	 total steps:21761


  2%|▏         | 24672/1000000 [00:06<04:17, 3794.42it/s]

episode: 1099 	 episode_reward: 20.125 	 total steps:24090


  3%|▎         | 27140/1000000 [00:06<03:58, 4076.82it/s]

episode: 1199 	 episode_reward: 22.5 	 total steps:26361


  3%|▎         | 29198/1000000 [00:07<04:05, 3959.83it/s]

episode: 1299 	 episode_reward: 25.625 	 total steps:28541


  3%|▎         | 31671/1000000 [00:07<03:56, 4088.19it/s]

episode: 1399 	 episode_reward: 28.125 	 total steps:31198


  3%|▎         | 34171/1000000 [00:08<03:53, 4128.23it/s]

episode: 1499 	 episode_reward: 20.5 	 total steps:33495


  4%|▎         | 36278/1000000 [00:09<04:13, 3799.16it/s]

episode: 1599 	 episode_reward: 24.875 	 total steps:35930


  4%|▍         | 38735/1000000 [00:09<03:56, 4061.17it/s]

episode: 1699 	 episode_reward: 27.625 	 total steps:38313


  4%|▍         | 41235/1000000 [00:10<03:59, 3999.74it/s]

episode: 1799 	 episode_reward: 27.0 	 total steps:40532


  4%|▍         | 43290/1000000 [00:10<03:53, 4096.35it/s]

episode: 1899 	 episode_reward: 15.0 	 total steps:42759


  5%|▍         | 45780/1000000 [00:11<03:54, 4064.20it/s]

episode: 1999 	 episode_reward: 23.0 	 total steps:45337


  5%|▍         | 48305/1000000 [00:12<04:09, 3816.82it/s]

episode: 2099 	 episode_reward: 27.625 	 total steps:47868


  5%|▌         | 51203/1000000 [00:12<03:50, 4113.14it/s]

episode: 2199 	 episode_reward: 26.875 	 total steps:50440


  5%|▌         | 53675/1000000 [00:13<03:55, 4021.04it/s]

episode: 2299 	 episode_reward: 29.625 	 total steps:53132


  6%|▌         | 56146/1000000 [00:14<04:05, 3838.99it/s]

episode: 2399 	 episode_reward: 17.0 	 total steps:55462


  6%|▌         | 58152/1000000 [00:14<03:57, 3960.02it/s]

episode: 2499 	 episode_reward: 24.0 	 total steps:57753


  6%|▌         | 61429/1000000 [00:15<03:53, 4014.83it/s]

episode: 2599 	 episode_reward: 27.0 	 total steps:60648


  6%|▋         | 63939/1000000 [00:16<03:46, 4137.63it/s]

episode: 2699 	 episode_reward: 27.0 	 total steps:63344


  7%|▋         | 66467/1000000 [00:16<03:50, 4050.89it/s]

episode: 2799 	 episode_reward: 26.0 	 total steps:65930


  7%|▋         | 68962/1000000 [00:17<03:52, 4010.58it/s]

episode: 2899 	 episode_reward: 32.25 	 total steps:68496


  7%|▋         | 71509/1000000 [00:17<03:40, 4217.70it/s]

episode: 2999 	 episode_reward: 25.0 	 total steps:71071


  7%|▋         | 74046/1000000 [00:18<03:44, 4130.18it/s]

episode: 3099 	 episode_reward: 22.5 	 total steps:73575


  8%|▊         | 76580/1000000 [00:19<03:50, 4006.39it/s]

episode: 3199 	 episode_reward: 22.125 	 total steps:75893


  8%|▊         | 79103/1000000 [00:19<03:38, 4213.10it/s]

episode: 3299 	 episode_reward: 23.75 	 total steps:78381


  8%|▊         | 81651/1000000 [00:20<03:41, 4142.62it/s]

episode: 3399 	 episode_reward: 32.625 	 total steps:81083


  8%|▊         | 84173/1000000 [00:21<04:03, 3759.99it/s]

episode: 3499 	 episode_reward: 29.5 	 total steps:83561


  9%|▊         | 86609/1000000 [00:21<03:47, 4019.60it/s]

episode: 3599 	 episode_reward: 32.625 	 total steps:86127


  9%|▉         | 89485/1000000 [00:22<03:45, 4041.69it/s]

episode: 3699 	 episode_reward: 32.75 	 total steps:88692


  9%|▉         | 92378/1000000 [00:23<03:57, 3820.80it/s]

episode: 3799 	 episode_reward: 25.875 	 total steps:91674


  9%|▉         | 94859/1000000 [00:23<03:39, 4121.70it/s]

episode: 3899 	 episode_reward: 29.125 	 total steps:94133


 10%|▉         | 97344/1000000 [00:24<03:44, 4016.50it/s]

episode: 3999 	 episode_reward: 24.875 	 total steps:96855


 10%|█         | 100274/1000000 [00:25<03:57, 3789.71it/s]

episode: 4099 	 episode_reward: 34.875 	 total steps:99642


 10%|█         | 102794/1000000 [00:25<03:36, 4140.35it/s]

episode: 4199 	 episode_reward: 30.125 	 total steps:102352


 11%|█         | 105678/1000000 [00:26<03:48, 3907.13it/s]

episode: 4299 	 episode_reward: 33.0 	 total steps:105154


 11%|█         | 108624/1000000 [00:27<03:43, 3987.82it/s]

episode: 4399 	 episode_reward: 26.25 	 total steps:108036


 11%|█         | 111528/1000000 [00:27<03:33, 4156.88it/s]

episode: 4499 	 episode_reward: 33.875 	 total steps:111102


 11%|█▏        | 114472/1000000 [00:28<03:34, 4123.53it/s]

episode: 4599 	 episode_reward: 33.375 	 total steps:113756


 12%|█▏        | 117420/1000000 [00:29<03:34, 4113.17it/s]

episode: 4699 	 episode_reward: 28.25 	 total steps:116686


 12%|█▏        | 119951/1000000 [00:29<03:28, 4221.26it/s]

episode: 4799 	 episode_reward: 28.5 	 total steps:119638


 12%|█▏        | 123666/1000000 [00:30<03:33, 4095.82it/s]

episode: 4899 	 episode_reward: 30.125 	 total steps:122982


 13%|█▎        | 126622/1000000 [00:31<03:28, 4189.32it/s]

episode: 4999 	 episode_reward: 26.25 	 total steps:126195


 13%|█▎        | 129958/1000000 [00:32<03:32, 4091.40it/s]

episode: 5099 	 episode_reward: 22.375 	 total steps:129339


 13%|█▎        | 132886/1000000 [00:33<03:37, 3980.08it/s]

episode: 5199 	 episode_reward: 24.875 	 total steps:132320


 14%|█▎        | 135811/1000000 [00:33<03:27, 4169.36it/s]

episode: 5299 	 episode_reward: 27.375 	 total steps:135435


 14%|█▍        | 139601/1000000 [00:34<03:26, 4159.50it/s]

episode: 5399 	 episode_reward: 31.125 	 total steps:138918


 14%|█▍        | 142941/1000000 [00:35<03:28, 4106.09it/s]

episode: 5499 	 episode_reward: 35.125 	 total steps:142233


 15%|█▍        | 145872/1000000 [00:36<03:27, 4122.93it/s]

episode: 5599 	 episode_reward: 22.875 	 total steps:145349


 15%|█▍        | 149239/1000000 [00:37<03:32, 4007.44it/s]

episode: 5699 	 episode_reward: 36.5 	 total steps:148635


 15%|█▌        | 152560/1000000 [00:37<03:34, 3946.14it/s]

episode: 5799 	 episode_reward: 36.875 	 total steps:152033


 16%|█▌        | 155912/1000000 [00:38<03:22, 4159.54it/s]

episode: 5899 	 episode_reward: 27.875 	 total steps:155273


 16%|█▌        | 159674/1000000 [00:39<03:22, 4149.52it/s]

episode: 5999 	 episode_reward: 23.0 	 total steps:158867


 16%|█▋        | 163008/1000000 [00:40<03:21, 4148.76it/s]

episode: 6099 	 episode_reward: 26.875 	 total steps:162254


 17%|█▋        | 166363/1000000 [00:41<03:24, 4082.95it/s]

episode: 6199 	 episode_reward: 31.25 	 total steps:165918


 17%|█▋        | 170164/1000000 [00:42<03:18, 4186.62it/s]

episode: 6299 	 episode_reward: 28.75 	 total steps:169661


 17%|█▋        | 173899/1000000 [00:43<03:29, 3937.43it/s]

episode: 6399 	 episode_reward: 40.0 	 total steps:173346


 18%|█▊        | 177616/1000000 [00:44<03:21, 4073.42it/s]

episode: 6499 	 episode_reward: 28.875 	 total steps:177041


 18%|█▊        | 181855/1000000 [00:45<03:15, 4180.23it/s]

episode: 6599 	 episode_reward: 33.375 	 total steps:181039


 19%|█▊        | 185183/1000000 [00:46<03:24, 3978.85it/s]

episode: 6699 	 episode_reward: 21.375 	 total steps:184453


 19%|█▉        | 188887/1000000 [00:46<03:23, 3979.76it/s]

episode: 6799 	 episode_reward: 37.875 	 total steps:188447


 19%|█▉        | 192691/1000000 [00:47<03:23, 3961.96it/s]

episode: 6899 	 episode_reward: 39.125 	 total steps:192262


 20%|█▉        | 196865/1000000 [00:48<03:18, 4040.63it/s]

episode: 6999 	 episode_reward: 46.875 	 total steps:196161


 20%|██        | 200659/1000000 [00:49<03:19, 4004.37it/s]

episode: 7099 	 episode_reward: 50.5 	 total steps:200218


 20%|██        | 204845/1000000 [00:50<03:17, 4026.49it/s]

episode: 7199 	 episode_reward: 37.125 	 total steps:204118


 21%|██        | 208682/1000000 [00:51<03:18, 3986.76it/s]

episode: 7299 	 episode_reward: 51.0 	 total steps:208060


 21%|██        | 212119/1000000 [00:52<03:18, 3968.00it/s]

episode: 7399 	 episode_reward: 43.75 	 total steps:211780


 22%|██▏       | 216350/1000000 [00:53<03:27, 3777.92it/s]

episode: 7499 	 episode_reward: 41.125 	 total steps:215696


 22%|██▏       | 220556/1000000 [00:54<03:18, 3918.09it/s]

episode: 7599 	 episode_reward: 43.0 	 total steps:219913


 22%|██▏       | 224744/1000000 [00:55<03:17, 3929.55it/s]

episode: 7699 	 episode_reward: 45.875 	 total steps:224166


 23%|██▎       | 228891/1000000 [00:56<03:15, 3938.89it/s]

episode: 7799 	 episode_reward: 32.5 	 total steps:228349


 23%|██▎       | 233384/1000000 [00:57<03:15, 3930.56it/s]

episode: 7899 	 episode_reward: 32.75 	 total steps:232728


 24%|██▍       | 237517/1000000 [00:58<03:10, 3993.51it/s]

episode: 7999 	 episode_reward: 41.25 	 total steps:237065


 24%|██▍       | 242072/1000000 [01:00<03:05, 4075.62it/s]

episode: 8099 	 episode_reward: 44.75 	 total steps:241280


 25%|██▍       | 246662/1000000 [01:01<03:04, 4087.54it/s]

episode: 8199 	 episode_reward: 55.5 	 total steps:245942


 25%|██▌       | 251254/1000000 [01:02<02:59, 4171.98it/s]

episode: 8299 	 episode_reward: 38.625 	 total steps:250434


 26%|██▌       | 255885/1000000 [01:03<02:58, 4166.77it/s]

episode: 8399 	 episode_reward: 35.125 	 total steps:255133


 26%|██▌       | 260074/1000000 [01:04<03:11, 3859.20it/s]

episode: 8499 	 episode_reward: 43.875 	 total steps:259715


 27%|██▋       | 265111/1000000 [01:05<03:01, 4037.99it/s]

episode: 8599 	 episode_reward: 59.0 	 total steps:264286


 27%|██▋       | 269685/1000000 [01:06<03:00, 4043.30it/s]

episode: 8699 	 episode_reward: 51.625 	 total steps:269007


 27%|██▋       | 274276/1000000 [01:07<02:55, 4126.03it/s]

episode: 8799 	 episode_reward: 40.0 	 total steps:273662


 28%|██▊       | 279285/1000000 [01:09<02:53, 4159.16it/s]

episode: 8899 	 episode_reward: 65.25 	 total steps:278647


 28%|██▊       | 283886/1000000 [01:10<02:51, 4176.95it/s]

episode: 8999 	 episode_reward: 41.125 	 total steps:283248


 29%|██▉       | 288995/1000000 [01:11<02:55, 4061.07it/s]

episode: 9099 	 episode_reward: 47.25 	 total steps:288191


 29%|██▉       | 293603/1000000 [01:12<02:53, 4078.17it/s]

episode: 9199 	 episode_reward: 34.25 	 total steps:292848


 30%|██▉       | 298631/1000000 [01:13<02:47, 4191.25it/s]

episode: 9299 	 episode_reward: 53.25 	 total steps:298121


 30%|███       | 304116/1000000 [01:15<02:58, 3895.24it/s]

episode: 9399 	 episode_reward: 53.0 	 total steps:303429


 31%|███       | 309569/1000000 [01:16<02:48, 4106.11it/s]

episode: 9499 	 episode_reward: 46.125 	 total steps:308823


 32%|███▏      | 315062/1000000 [01:17<02:42, 4219.21it/s]

episode: 9599 	 episode_reward: 52.0 	 total steps:314496


 32%|███▏      | 320135/1000000 [01:19<02:55, 3874.25it/s]

episode: 9699 	 episode_reward: 61.875 	 total steps:319409


 33%|███▎      | 325536/1000000 [01:20<02:47, 4032.88it/s]

episode: 9799 	 episode_reward: 57.875 	 total steps:324869


 33%|███▎      | 330575/1000000 [01:21<02:40, 4160.52it/s]

episode: 9899 	 episode_reward: 47.875 	 total steps:330127


 34%|███▎      | 336452/1000000 [01:23<02:47, 3956.42it/s]

episode: 9999 	 episode_reward: 61.5 	 total steps:335907


 34%|███▍      | 342336/1000000 [01:24<02:38, 4158.68it/s]

episode: 10099 	 episode_reward: 66.0 	 total steps:341698


 35%|███▍      | 347769/1000000 [01:26<02:35, 4189.61it/s]

episode: 10199 	 episode_reward: 55.875 	 total steps:347033


 35%|███▌      | 353657/1000000 [01:27<02:39, 4063.72it/s]

episode: 10299 	 episode_reward: 63.0 	 total steps:352925


 36%|███▌      | 358717/1000000 [01:28<02:32, 4205.18it/s]

episode: 10399 	 episode_reward: 41.375 	 total steps:358217


 36%|███▋      | 364687/1000000 [01:30<02:40, 3970.51it/s]

episode: 10499 	 episode_reward: 64.375 	 total steps:364005


 37%|███▋      | 370559/1000000 [01:31<02:31, 4156.86it/s]

episode: 10599 	 episode_reward: 60.25 	 total steps:370098


 38%|███▊      | 376880/1000000 [01:33<02:33, 4049.06it/s]

episode: 10699 	 episode_reward: 61.5 	 total steps:376059


 38%|███▊      | 382742/1000000 [01:34<02:27, 4193.27it/s]

episode: 10799 	 episode_reward: 60.0 	 total steps:381892


 39%|███▉      | 389502/1000000 [01:36<02:28, 4106.82it/s]

episode: 10899 	 episode_reward: 74.875 	 total steps:388987


 40%|███▉      | 395795/1000000 [01:37<02:25, 4155.17it/s]

episode: 10999 	 episode_reward: 68.125 	 total steps:395268


 40%|████      | 402091/1000000 [01:39<02:25, 4098.76it/s]

episode: 11099 	 episode_reward: 73.125 	 total steps:401604


 41%|████      | 408340/1000000 [01:40<02:35, 3811.21it/s]

episode: 11199 	 episode_reward: 69.0 	 total steps:407927


 42%|████▏     | 415028/1000000 [01:42<02:20, 4177.87it/s]

episode: 11299 	 episode_reward: 58.375 	 total steps:414441


 42%|████▏     | 421312/1000000 [01:44<02:22, 4053.86it/s]

episode: 11399 	 episode_reward: 76.0 	 total steps:420845


 43%|████▎     | 427973/1000000 [01:45<02:17, 4148.63it/s]

episode: 11499 	 episode_reward: 63.0 	 total steps:427460


 44%|████▎     | 435070/1000000 [01:47<02:15, 4169.11it/s]

episode: 11599 	 episode_reward: 61.625 	 total steps:434416


 44%|████▍     | 441765/1000000 [01:49<02:15, 4121.13it/s]

episode: 11699 	 episode_reward: 85.5 	 total steps:440983


 45%|████▍     | 448524/1000000 [01:50<02:18, 3992.50it/s]

episode: 11799 	 episode_reward: 67.0 	 total steps:448085


 46%|████▌     | 455717/1000000 [01:52<02:09, 4188.55it/s]

episode: 11899 	 episode_reward: 76.75 	 total steps:455148


 46%|████▋     | 462914/1000000 [01:54<02:07, 4220.97it/s]

episode: 11999 	 episode_reward: 64.125 	 total steps:462241


 47%|████▋     | 470080/1000000 [01:56<02:08, 4113.50it/s]

episode: 12099 	 episode_reward: 87.125 	 total steps:469548


 48%|████▊     | 477320/1000000 [01:57<02:06, 4139.76it/s]

episode: 12199 	 episode_reward: 59.5 	 total steps:476877


 48%|████▊     | 484530/1000000 [01:59<02:07, 4039.32it/s]

episode: 12299 	 episode_reward: 70.0 	 total steps:484001


 49%|████▉     | 492220/1000000 [02:01<02:07, 3996.40it/s]

episode: 12399 	 episode_reward: 55.875 	 total steps:491704


 50%|█████     | 501042/1000000 [02:03<02:03, 4055.36it/s]

episode: 12499 	 episode_reward: 84.0 	 total steps:500580


 51%|█████     | 509867/1000000 [02:05<02:01, 4034.79it/s]

episode: 12599 	 episode_reward: 92.125 	 total steps:509425


 52%|█████▏    | 519451/1000000 [02:08<01:53, 4221.95it/s]

episode: 12699 	 episode_reward: 108.0 	 total steps:518738


 53%|█████▎    | 529257/1000000 [02:10<01:56, 4056.01it/s]

episode: 12799 	 episode_reward: 95.625 	 total steps:528587


 54%|█████▍    | 538919/1000000 [02:12<01:49, 4193.34it/s]

episode: 12899 	 episode_reward: 95.0 	 total steps:538489


 55%|█████▌    | 550225/1000000 [02:15<01:47, 4176.31it/s]

episode: 12999 	 episode_reward: 106.625 	 total steps:549546


 56%|█████▌    | 562144/1000000 [02:18<01:48, 4042.14it/s]

episode: 13099 	 episode_reward: 118.875 	 total steps:561599


 57%|█████▋    | 573733/1000000 [02:21<01:45, 4056.92it/s]

episode: 13199 	 episode_reward: 123.125 	 total steps:572904


 59%|█████▊    | 585906/1000000 [02:24<01:39, 4144.73it/s]

episode: 13299 	 episode_reward: 124.125 	 total steps:585217


 59%|█████▉    | 593980/1000000 [02:26<01:37, 4158.94it/s]