In [4]:
import tensorflow as tf
import gym
from IPython.display import HTML
from IPython import display as ipythondisplay
from pyvirtualdisplay import Display
from gym.wrappers import Monitor

# Silence library noise so the training logs printed below stay readable.
import warnings
# NOTE(review): FutureWarning is a subclass of Warning, so the first filter is
# subsumed by the blanket `Warning` filter on the next line; together they hide
# every Python warning raised after this point.
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=Warning)
# Restrict TensorFlow's v1-compat logger to error messages only.
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)

from gym.envs.box2d import CarRacing
from stable_baselines.common import make_vec_env
from stable_baselines.common.policies import CnnPolicy
from stable_baselines.common.vec_env import DummyVecEnv,VecVideoRecorder
from stable_baselines import PPO2
import glob
import base64
import wandb
import time
import os
import numpy as np
import pandas as pd
from tqdm import tqdm
from typing import Any, Dict

## General Functions

In [2]:
from stable_baselines.common.callbacks import BaseCallback

class PPOCustomCallback(BaseCallback):
    """
    Training callback that logs episode rewards to Weights & Biases and
    periodically checkpoints the model as a W&B artifact.

    On every environment step the step rewards are added to a per-environment
    running total. Whenever at least one environment is flagged as done, the
    mean running total of the flagged environments is logged under
    ``reward_per_episode`` and those totals are reset to zero. Every
    ``log_interval`` such events (including the very first one) the model is
    additionally saved and uploaded as an artifact of type ``model``.

    :param verbose: (int) Verbosity level 0: not output 1: info 2: debug
    :param model_save_name: (str or None) Path (without the ``.zip`` suffix)
        used both for the checkpoint file and as the artifact name. When
        ``None`` the module-level ``MODEL_SAVE_NAME`` is looked up at save
        time, which is what the callback did before this argument existed.
    """
    def __init__(self, verbose=0, model_save_name=None):
        super(PPOCustomCallback, self).__init__(verbose)
        self.model_save_name = model_save_name
        # Number of steps on which at least one environment finished an
        # episode (simultaneous finishes count once).
        self.episodes = 0
        # Per-environment cumulative reward of the episode in progress.
        # Starts with 4 slots for backward compatibility and is re-sized to
        # the actual number of environments on the first step.
        self.total_episode_reward = np.zeros(4, dtype=np.float32)

    def _resolve_save_name(self):
        """Return the checkpoint name: the constructor value, else the global."""
        if self.model_save_name is not None:
            return self.model_save_name
        return MODEL_SAVE_NAME

    def _save_checkpoint(self) -> None:
        """Save the model to disk and upload the resulting zip as a W&B artifact."""
        save_name = self._resolve_save_name()
        self.model.save(save_name)
        # Stored as an artifact so W&B keeps a version history of the model.
        artifact = wandb.Artifact(save_name, type='model')
        artifact.add_file(save_name + ".zip")
        wandb.log_artifact(artifact)

    def _on_training_start(self) -> None:
        """
        This method is called before the first rollout starts.
        """
        pass

    def _on_rollout_start(self) -> None:
        """
        A rollout is the collection of environment interaction
        using the current policy.
        This event is triggered before collecting new samples.
        """
        pass

    def _on_step(self) -> bool:
        """
        Accumulate step rewards and log/checkpoint when an episode finishes.

        :return: (bool) Always ``True`` so that training continues.
        """
        step_rewards = np.asarray(self.locals["rewards"], dtype=np.float32)
        # Size the accumulator from the data instead of assuming 4 envs.
        if self.total_episode_reward.shape != step_rewards.shape:
            self.total_episode_reward = np.zeros_like(step_rewards)
        # Update the cumulative reward that is logged at the end of each episode.
        self.total_episode_reward += step_rewards

        # NOTE(review): `mb_dones[-1]` is presumably the done flag the PPO2
        # runner recorded *before* the current env.step (i.e. for the previous
        # transition). If so, the first reward of a new episode is credited to
        # the episode that just ended -- verify against the runner in the
        # installed stable_baselines version.
        done_mask = np.asarray(self.locals["mb_dones"][-1], dtype=bool)
        if not done_mask.any():
            return True

        # Checkpoint once every `log_interval` finish events.
        if self.episodes % self.locals["log_interval"] == 0:
            self._save_checkpoint()

        # The reward is logged exactly once per finish event, whether or not a
        # checkpoint was written.
        wandb.log({"reward_per_episode": np.mean(self.total_episode_reward[done_mask])})

        self.episodes += 1
        self.total_episode_reward[done_mask] = 0
        return True

    def _on_rollout_end(self) -> None:
        """
        This event is triggered before updating the policy.
        """
        pass

    def _on_training_end(self) -> None:
        """
        This event is triggered before exiting the learn() method.
        """
        pass

# Baseline environment

## Training

In [2]:
def env():
    """Create one baseline CarRacing environment (one track, zero obstacles).

    This is a zero-argument factory: the training cell hands it to
    ``make_vec_env`` so that each vectorised worker builds its own instance.
    """
    baseline_settings = dict(
        grayscale=1,
        show_info_panel=0,
        discretize_actions="hard",
        frames_per_state=4,
        num_lanes=1,
        num_lanes_changes=1,
        num_tracks=1,
        allow_reverse=False,
        max_time_out=2,
        verbose=0,
        num_obstacles=0,
        max_step_reward=1,
    )
    return CarRacing(**baseline_settings)

In [4]:
num = 1

# --- Run configuration -------------------------------------------------------
LOG_INTERVAL        = 100
WANDB_ID            = "" + str(num)
WNDB_NAME           = "Antoine" + str(num)
LOAD_SAVED_MODEL    = True
MODEL_SAVE_NAME     = "PPO_MODEL_" + str(num)
SAVED_MODEL_VERSION = "latest"
N_ENVS              = 4        # number of parallel environments
TOTAL_TIMESTEPS     = 2000000  # training budget for this cell

os.environ["WANDB_ENTITY"]  = "ant_ai"
os.environ["WANDB_PROJECT"] = "PPO_No_Obstacles"
os.environ["WANDB_RESUME"]  = "allow"
# NOTE(review): the run id is passed through `resume=`; newer wandb releases
# document `resume` as a mode ("allow"/"must"/"never"/"auto") and take the id
# via `id=` -- confirm against the installed wandb version before changing.
wandb.init(resume=WANDB_ID)
wandb.run.name = WNDB_NAME

# Keep `env` bound to the factory and give the vectorised env its own name so
# that this cell can be re-run without re-running the factory cell first.
train_env = make_vec_env(env, n_envs=N_ENVS)

# --- Load or create the model ------------------------------------------------
PPOmodel = None
if LOAD_SAVED_MODEL:
    try:
        model_artifact = wandb.use_artifact(MODEL_SAVE_NAME+':'+SAVED_MODEL_VERSION, type='model')
        artifact_dir = model_artifact.download()
        PPOmodel = PPO2.load(os.path.join(artifact_dir, MODEL_SAVE_NAME), env=train_env)
        print("LOAD SAVED PPΟ MODEL")
    except Exception as load_error:
        # Catch Exception (not a bare except) so Ctrl-C still interrupts, and
        # report why the artifact could not be used before falling back below.
        print("NO MODEL FOUND")
        print(f"Artifact load failed: {load_error!r}")

if PPOmodel is None:
    # Decide from what exists on disk rather than from leftover kernel
    # variables, so a fresh kernel and a re-run behave the same way.
    if os.path.exists(MODEL_SAVE_NAME + ".zip"):
        PPOmodel = PPO2.load(MODEL_SAVE_NAME, env=train_env)
        print("CONTINUE PPO MODEL TRAINING")
    else:
        PPOmodel = PPO2(CnnPolicy, train_env, verbose=1)
        print("INITIALIZE NEW PPO MODEL")

# --- Train model -------------------------------------------------------------
try:
    PPOmodel.learn(total_timesteps=TOTAL_TIMESTEPS, log_interval=LOG_INTERVAL, callback=PPOCustomCallback())
    PPOmodel.save(MODEL_SAVE_NAME)
finally:
    # Release the environments even if training is interrupted or fails.
    train_env.close()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mant_ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


[34m[1mwandb[0m:   1 of 1 files downloaded.  











Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where



LOAD SAVED PPΟ MODEL
-------------------------------------
| approxkl           | 0.008186035  |
| clipfrac           | 0.104003906  |
| explained_variance | 0.761        |
| fps                | 71           |
| n_updates          | 1            |
| policy_entropy     | 0.54089534   |
| policy_loss        | -0.008199417 |
| serial_timesteps   | 128          |
| time_elapsed       | 0            |
| total_timesteps    | 512          |
| value_loss         | 0.6223811    |
-------------------------------------
--------------------------------------
| approxkl           | 0.004611495   |
| clipfrac           | 0.061523438   |
| ep_len_mean        | 541           |
| ep_reward_mean     | 54.6          |
| explained_variance | 0.947         |
| fps                | 87            |
| n_updates          | 100           |
| policy_entropy     | 0.6423588     |
| policy_loss        | -0.00

--------------------------------------
| approxkl           | 0.0070926268  |
| clipfrac           | 0.06640625    |
| ep_len_mean        | 581           |
| ep_reward_mean     | 55.2          |
| explained_variance | 0.941         |
| fps                | 85            |
| n_updates          | 1000          |
| policy_entropy     | 0.4912929     |
| policy_loss        | -0.0036474462 |
| serial_timesteps   | 128000        |
| time_elapsed       | 5.88e+03      |
| total_timesteps    | 512000        |
| value_loss         | 23.82411      |
--------------------------------------
--------------------------------------
| approxkl           | 0.0031172594  |
| clipfrac           | 0.04345703    |
| ep_len_mean        | 532           |
| ep_reward_mean     | 76.3          |
| explained_variance | 0.528         |
| fps                | 94            |
| n_updates          | 1100          |
| policy_entropy     | 0.43890023    |
| policy_loss        | -0.0025658493 |
| serial_timesteps   | 14

---------------------------------------
| approxkl           | 0.0018591783   |
| clipfrac           | 0.017089844    |
| ep_len_mean        | 486            |
| ep_reward_mean     | 89.4           |
| explained_variance | 0.578          |
| fps                | 87             |
| n_updates          | 2500           |
| policy_entropy     | 0.47421846     |
| policy_loss        | -0.00033091567 |
| serial_timesteps   | 320000         |
| time_elapsed       | 1.47e+04       |
| total_timesteps    | 1280000        |
| value_loss         | 102.04931      |
---------------------------------------
--------------------------------------
| approxkl           | 0.0045541925  |
| clipfrac           | 0.05810547    |
| ep_len_mean        | 488           |
| ep_reward_mean     | 65.9          |
| explained_variance | 0.633         |
| fps                | 87            |
| n_updates          | 2600          |
| policy_entropy     | 0.49139935    |
| policy_loss        | -0.0017980405 |
| serial_t

## Evaluate model

In [2]:
def _make_baseline_eval_env():
    """Create the CarRacing environment used to evaluate the baseline agent.

    NOTE(review): unlike the baseline training factory, ``num_obstacles`` and
    ``max_step_reward`` are not passed here, so the environment's own defaults
    apply -- confirm they match the training configuration.
    """
    eval_settings = dict(
        grayscale=1,
        show_info_panel=0,
        discretize_actions="hard",
        frames_per_state=4,
        num_lanes=1,
        num_lanes_changes=1,
        num_tracks=1,
        allow_reverse=False,
        max_time_out=2,
        verbose=0,
    )
    return CarRacing(**eval_settings)


# Wrap a single environment instance in the vectorised API the model expects.
env = DummyVecEnv([_make_baseline_eval_env])

In [3]:
# Load the baseline agent's saved weights and attach the evaluation env.
model = PPO2.load('Model Weights/PPO/Baseline Environment Agent/PPO_MODEL_1-v98/PPO_MODEL_1',env=env)

model.set_env(env)

# Roll out 10 rendered episodes and print the return of each one.
# NOTE(review): predict() is called without `deterministic=True`, so actions
# are presumably sampled from the policy -- confirm this is intended for
# evaluation.
try:
    for episode in range(10):
        obs = env.reset()
        # Becomes a 1-element array after the first `+=`, because the
        # vectorised env returns one reward per wrapped environment.
        total_reward = 0
        done = False
        # `done` is a 1-element array here; its truthiness is only well
        # defined because exactly one environment is wrapped.
        while not done:
            action, states = model.predict(obs)
            obs, reward, done, info = env.step(action)
            total_reward += reward
            env.render()
        print(f'Episode: {episode}, Total Reward: {total_reward}')
finally:
    # Always release the environment (and its render window), even when the
    # evaluation is interrupted part-way through.
    env.close()

Episode: 0, Total Reward: [159.09991]
Episode: 1, Total Reward: [152.39993]
Episode: 2, Total Reward: [151.70003]
Episode: 3, Total Reward: [145.19997]
Episode: 4, Total Reward: [148.49901]
Episode: 5, Total Reward: [153.39998]
Episode: 6, Total Reward: [154.09999]
Episode: 7, Total Reward: [130.20055]
Episode: 8, Total Reward: [163.4997]
Episode: 9, Total Reward: [150.79999]


# Hard environment (Obstacles, Bonus, intersection, more complex tracks)

- The initial model is the one trained on the baseline environment.
- The idea is to fine-tune it to work in a more complex environment.
- Bonus value = +50 vs. obstacle value = -50 (hard-coded in the environment code).

Complexity is set through 3 parameters:
- num_tracks: 1 for a basic track, 2 for more complex tracks with intersections (X or T type)
- num_lanes: number of lanes on the track (1 for a normal track, 2 for multiple lanes)
- num_lanes_changes: number of times two lanes are merged into one along the track

Obstacles and bonuses are controlled by 2 parameters:
- num_obstacles: number of obstacles per subsection of the track (any value other than 0 enables obstacles).
If the car touches an obstacle, 50 points are deducted from the score.
- prop_good_obstacles: probability (between 0 and 1) that an obstacle is a bonus (yellow instead of red, and it rewards 50 points when touched)


## Training

In [5]:
def env():
    """Create one hard CarRacing environment (two tracks, obstacles and bonuses).

    This is a zero-argument factory: the training cell hands it to
    ``make_vec_env`` so that each vectorised worker builds its own instance.
    """
    hard_settings = dict(
        grayscale=1,
        show_info_panel=0,
        discretize_actions="hard",
        frames_per_state=4,
        num_lanes=1,
        num_lanes_changes=1,
        num_tracks=2,
        allow_reverse=False,
        max_time_out=2,
        verbose=0,
        num_obstacles=5,
        prop_good_obstacles=0.5,
    )
    return CarRacing(**hard_settings)

In [6]:
num = 1

# --- Run configuration -------------------------------------------------------
LOG_INTERVAL        = 100
WANDB_ID            = "" + str(num)
WNDB_NAME           = "Antoine" + str(num)
LOAD_SAVED_MODEL    = False
MODEL_SAVE_NAME     = "PPO_MODEL_Obstacles" + str(num)
SAVED_MODEL_VERSION = "latest"
N_ENVS              = 4        # number of parallel environments
TOTAL_TIMESTEPS     = 5000000  # training budget for this cell
# Weights of the agent trained on the baseline environment; fine-tuning starts
# from here when no obstacle-environment checkpoint exists yet.
BASELINE_MODEL_PATH = 'artifacts/PPO_MODEL_1-v98/PPO_MODEL_1'

os.environ["WANDB_ENTITY"]  = "ant_ai"
os.environ["WANDB_PROJECT"] = "PPO_With_Obstacles"
os.environ["WANDB_RESUME"]  = "allow"
# NOTE(review): the run id is passed through `resume=`; newer wandb releases
# document `resume` as a mode ("allow"/"must"/"never"/"auto") and take the id
# via `id=` -- confirm against the installed wandb version before changing.
wandb.init(resume=WANDB_ID)
wandb.run.name = WNDB_NAME

# Keep `env` bound to the factory and give the vectorised env its own name so
# that this cell can be re-run without re-running the factory cell first.
train_env = make_vec_env(env, n_envs=N_ENVS)

# --- Load or create the model ------------------------------------------------
PPOmodel = None
if LOAD_SAVED_MODEL:
    try:
        model_artifact = wandb.use_artifact(MODEL_SAVE_NAME+':'+SAVED_MODEL_VERSION, type='model')
        artifact_dir = model_artifact.download()
        PPOmodel = PPO2.load(os.path.join(artifact_dir, MODEL_SAVE_NAME), env=train_env)
        print("LOAD SAVED PPΟ MODEL")
    except Exception as load_error:
        # Catch Exception (not a bare except) so Ctrl-C still interrupts, and
        # report why the artifact could not be used before falling back below.
        print("NO MODEL FOUND")
        print(f"Artifact load failed: {load_error!r}")

if PPOmodel is None:
    # Decide from what exists on disk rather than from leftover kernel
    # variables, so a fresh kernel and a re-run behave the same way.
    if os.path.exists(MODEL_SAVE_NAME + ".zip"):
        PPOmodel = PPO2.load(MODEL_SAVE_NAME, env=train_env)
        print("CONTINUE PPO MODEL TRAINING")
    else:
        PPOmodel = PPO2.load(BASELINE_MODEL_PATH, env=train_env)
        print("INITIALIZE NEW PPO MODEL")

# --- Train model -------------------------------------------------------------
try:
    PPOmodel.learn(total_timesteps=TOTAL_TIMESTEPS, log_interval=LOG_INTERVAL, callback=PPOCustomCallback())
    PPOmodel.save(MODEL_SAVE_NAME)
finally:
    # Release the environments even if training is interrupted or fails.
    train_env.close()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mant_ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


INITIALIZE NEW PPO MODEL
-------------------------------------
| approxkl           | 0.019688921  |
| clipfrac           | 0.1977539    |
| explained_variance | 0.249        |
| fps                | 50           |
| n_updates          | 1            |
| policy_entropy     | 0.68237      |
| policy_loss        | -0.009537595 |
| serial_timesteps   | 128          |
| time_elapsed       | 0            |
| total_timesteps    | 512          |
| value_loss         | 22.456802    |
-------------------------------------
---------------------------------------
| approxkl           | 0.005504113    |
| clipfrac           | 0.0703125      |
| ep_len_mean        | 1.1e+03        |
| ep_reward_mean     | 114            |
| explained_variance | 0.291          |
| fps                | 74             |
| n_updates          | 100            |
| policy_entropy     | 0.7449099      |
| policy_loss        | -0.00097121863 |
| serial_timesteps   | 12800          |
| time_elapsed       | 627            |
|

-------------------------------------
| approxkl           | 0.008492806  |
| clipfrac           | 0.10058594   |
| ep_len_mean        | 1.03e+03     |
| ep_reward_mean     | 142          |
| explained_variance | 0.00212      |
| fps                | 88           |
| n_updates          | 1500         |
| policy_entropy     | 0.75509036   |
| policy_loss        | -0.020744074 |
| serial_timesteps   | 192000       |
| time_elapsed       | 9.35e+03     |
| total_timesteps    | 768000       |
| value_loss         | 1.4775622    |
-------------------------------------
--------------------------------------
| approxkl           | 0.0059611653  |
| clipfrac           | 0.080078125   |
| ep_len_mean        | 979           |
| ep_reward_mean     | 137           |
| explained_variance | 0.112         |
| fps                | 79            |
| n_updates          | 1600          |
| policy_entropy     | 0.7214931     |
| policy_loss        | -0.0013934328 |
| serial_timesteps   | 204800        |
|

--------------------------------------
| approxkl           | 0.014655727   |
| clipfrac           | 0.07861328    |
| ep_len_mean        | 800           |
| ep_reward_mean     | 133           |
| explained_variance | 0.519         |
| fps                | 89            |
| n_updates          | 3000          |
| policy_entropy     | 0.49510506    |
| policy_loss        | -0.0076194443 |
| serial_timesteps   | 384000        |
| time_elapsed       | 1.87e+04      |
| total_timesteps    | 1536000       |
| value_loss         | 61.84205      |
--------------------------------------
-------------------------------------
| approxkl           | 0.0033818914 |
| clipfrac           | 0.03173828   |
| ep_len_mean        | 949          |
| ep_reward_mean     | 144          |
| explained_variance | 0.163        |
| fps                | 80           |
| n_updates          | 3100         |
| policy_entropy     | 0.79401845   |
| policy_loss        | -0.002059787 |
| serial_timesteps   | 396800      

--------------------------------------
| approxkl           | 0.004585153   |
| clipfrac           | 0.06298828    |
| ep_len_mean        | 856           |
| ep_reward_mean     | 135           |
| explained_variance | 0.703         |
| fps                | 87            |
| n_updates          | 4500          |
| policy_entropy     | 0.42151642    |
| policy_loss        | -0.0025164934 |
| serial_timesteps   | 576000        |
| time_elapsed       | 2.81e+04      |
| total_timesteps    | 2304000       |
| value_loss         | 16.7984       |
--------------------------------------
-------------------------------------
| approxkl           | 0.009070338  |
| clipfrac           | 0.06201172   |
| ep_len_mean        | 856          |
| ep_reward_mean     | 111          |
| explained_variance | 0.859        |
| fps                | 86           |
| n_updates          | 4600         |
| policy_entropy     | 0.46444973   |
| policy_loss        | -0.019557245 |
| serial_timesteps   | 588800      

--------------------------------------
| approxkl           | 0.006166261   |
| clipfrac           | 0.087402344   |
| ep_len_mean        | 836           |
| ep_reward_mean     | 149           |
| explained_variance | 0.647         |
| fps                | 88            |
| n_updates          | 6000          |
| policy_entropy     | 0.5455546     |
| policy_loss        | 0.00021869713 |
| serial_timesteps   | 768000        |
| time_elapsed       | 3.76e+04      |
| total_timesteps    | 3072000       |
| value_loss         | 25.266825     |
--------------------------------------
------------------------------------
| approxkl           | 0.007923445 |
| clipfrac           | 0.092285156 |
| ep_len_mean        | 831         |
| ep_reward_mean     | 150         |
| explained_variance | 0.834       |
| fps                | 85          |
| n_updates          | 6100        |
| policy_entropy     | 0.6249434   |
| policy_loss        | -0.01039305 |
| serial_timesteps   | 780800      |
| time_e

--------------------------------------
| approxkl           | 0.0039464696  |
| clipfrac           | 0.052734375   |
| ep_len_mean        | 816           |
| ep_reward_mean     | 143           |
| explained_variance | 0.685         |
| fps                | 71            |
| n_updates          | 7500          |
| policy_entropy     | 0.69118667    |
| policy_loss        | -0.0022888347 |
| serial_timesteps   | 960000        |
| time_elapsed       | 4.71e+04      |
| total_timesteps    | 3840000       |
| value_loss         | 76.38838      |
--------------------------------------
-------------------------------------
| approxkl           | 0.0040715868 |
| clipfrac           | 0.04736328   |
| ep_len_mean        | 890          |
| ep_reward_mean     | 153          |
| explained_variance | 0.742        |
| fps                | 88           |
| n_updates          | 7600         |
| policy_entropy     | 0.6715275    |
| policy_loss        | -0.011488377 |
| serial_timesteps   | 972800      

-------------------------------------
| approxkl           | 0.0044385414 |
| clipfrac           | 0.0546875    |
| ep_len_mean        | 799          |
| ep_reward_mean     | 159          |
| explained_variance | 0.255        |
| fps                | 76           |
| n_updates          | 9000         |
| policy_entropy     | 0.59396213   |
| policy_loss        | -0.007685321 |
| serial_timesteps   | 1152000      |
| time_elapsed       | 5.65e+04     |
| total_timesteps    | 4608000      |
| value_loss         | 6.30007      |
-------------------------------------
-------------------------------------
| approxkl           | 0.0070681768 |
| clipfrac           | 0.08642578   |
| ep_len_mean        | 820          |
| ep_reward_mean     | 175          |
| explained_variance | 0.832        |
| fps                | 87           |
| n_updates          | 9100         |
| policy_entropy     | 0.65974665   |
| policy_loss        | -0.008000364 |
| serial_timesteps   | 1164800      |
| time_elaps

In [7]:
# End the active Weights & Biases run; the run history/summary for
# `reward_per_episode` is printed as this cell's output.
wandb.finish()

0,1
reward_per_episode,▄▇▄▄▆▇▄▆▄▅▇▇▆▇▆▃▅▇▇▆▇▇▇▆▆█▆▁▂▆██▆▆█▅██▆▇

0,1
reward_per_episode,260.59921


## Evaluate model

In [5]:
def _make_hard_eval_env():
    """Create the CarRacing environment used to evaluate the obstacle agent.

    NOTE(review): evaluation uses ``num_obstacles=10`` whereas the
    hard-environment training factory uses 5 -- confirm the harder test
    setting is intentional.
    """
    eval_settings = dict(
        grayscale=1,
        show_info_panel=0,
        discretize_actions="hard",
        frames_per_state=4,
        num_lanes=1,
        num_lanes_changes=1,
        num_tracks=2,
        allow_reverse=False,
        max_time_out=2,
        verbose=0,
        num_obstacles=10,
        prop_good_obstacles=0.5,
    )
    return CarRacing(**eval_settings)


# Wrap a single environment instance in the vectorised API the model expects.
env = DummyVecEnv([_make_hard_eval_env])

In [7]:
# Load the agent fine-tuned on the obstacle environment and attach the eval env.
model = PPO2.load('Model Weights/PPO/Complex Environment Agent/PPO_MODEL_OBS-v56/PPO_MODEL_Obstacles1',env=env)

model.set_env(env)

# Roll out 100 rendered episodes, collecting each episode's return.
# NOTE(review): predict() is called without `deterministic=True`, so actions
# are presumably sampled from the policy -- confirm this is intended for
# evaluation.
test_set = []
try:
    for episode in tqdm(range(100)):
        obs = env.reset()
        # Becomes a 1-element array after the first `+=`, because the
        # vectorised env returns one reward per wrapped environment.
        total_reward = 0
        done = False
        # `done` is a 1-element array here; its truthiness is only well
        # defined because exactly one environment is wrapped.
        while not done:
            action, states = model.predict(obs)
            obs, reward, done, info = env.step(action)
            total_reward += reward
            env.render()
        print(f'Episode: {episode}, Total Reward: {total_reward}')
        test_set.append(total_reward)
finally:
    # Always release the environment (and its render window), even when the
    # evaluation is interrupted part-way through.
    env.close()

print('Average Reward: ',np.mean(test_set))

  1%|▊                                                                                 | 1/100 [00:17<29:15, 17.73s/it]

Episode: 0, Total Reward: [228.20038]


  2%|█▋                                                                                | 2/100 [00:29<23:29, 14.38s/it]

Episode: 1, Total Reward: [116.59984]


  3%|██▍                                                                               | 3/100 [00:41<21:38, 13.38s/it]

Episode: 2, Total Reward: [84.299446]


  4%|███▎                                                                              | 4/100 [00:54<20:48, 13.00s/it]

Episode: 3, Total Reward: [-8.499313]


  5%|████                                                                              | 5/100 [01:28<32:44, 20.68s/it]

Episode: 4, Total Reward: [402.49255]


  6%|████▉                                                                             | 6/100 [01:47<31:34, 20.15s/it]

Episode: 5, Total Reward: [177.99765]


  7%|█████▋                                                                            | 7/100 [02:10<32:23, 20.90s/it]

Episode: 6, Total Reward: [411.7957]


  8%|██████▌                                                                           | 8/100 [02:27<30:29, 19.89s/it]

Episode: 7, Total Reward: [74.70051]


  9%|███████▍                                                                          | 9/100 [02:42<27:44, 18.29s/it]

Episode: 8, Total Reward: [265.4976]


 10%|████████                                                                         | 10/100 [03:00<27:05, 18.06s/it]

Episode: 9, Total Reward: [210.49835]


 11%|████████▉                                                                        | 11/100 [03:17<26:29, 17.86s/it]

Episode: 10, Total Reward: [366.19763]


 12%|█████████▋                                                                       | 12/100 [03:20<19:16, 13.14s/it]

Episode: 11, Total Reward: [-55.699883]


 13%|██████████▌                                                                      | 13/100 [03:34<19:37, 13.53s/it]

Episode: 12, Total Reward: [266.0986]


 14%|███████████▎                                                                     | 14/100 [03:43<17:18, 12.07s/it]

Episode: 13, Total Reward: [8.600517]


 15%|████████████▏                                                                    | 15/100 [03:57<18:01, 12.72s/it]

Episode: 14, Total Reward: [212.49863]


 16%|████████████▉                                                                    | 16/100 [04:15<19:53, 14.21s/it]

Episode: 15, Total Reward: [282.2976]


 17%|█████████████▊                                                                   | 17/100 [04:27<18:50, 13.62s/it]

Episode: 16, Total Reward: [231.79892]


 18%|██████████████▌                                                                  | 18/100 [04:42<19:23, 14.19s/it]

Episode: 17, Total Reward: [165.39749]


 19%|███████████████▍                                                                 | 19/100 [05:02<21:25, 15.87s/it]

Episode: 18, Total Reward: [296.49835]


 20%|████████████████▏                                                                | 20/100 [05:21<22:28, 16.86s/it]

Episode: 19, Total Reward: [258.29697]


 21%|█████████████████                                                                | 21/100 [05:50<26:53, 20.43s/it]

Episode: 20, Total Reward: [172.1991]


 22%|█████████████████▊                                                               | 22/100 [06:03<23:38, 18.18s/it]

Episode: 21, Total Reward: [370.6972]


 23%|██████████████████▋                                                              | 23/100 [06:06<17:19, 13.50s/it]

Episode: 22, Total Reward: [-105.7]


 24%|███████████████████▍                                                             | 24/100 [06:25<19:22, 15.29s/it]

Episode: 23, Total Reward: [476.5951]


 25%|████████████████████▎                                                            | 25/100 [06:59<26:02, 20.83s/it]

Episode: 24, Total Reward: [-33.69881]


 26%|█████████████████████                                                            | 26/100 [07:23<27:01, 21.91s/it]

Episode: 25, Total Reward: [186.70084]


 27%|█████████████████████▊                                                           | 27/100 [07:30<21:04, 17.32s/it]

Episode: 26, Total Reward: [-117.000015]


 28%|██████████████████████▋                                                          | 28/100 [07:42<18:48, 15.68s/it]

Episode: 27, Total Reward: [195.49985]


 29%|███████████████████████▍                                                         | 29/100 [08:03<20:29, 17.32s/it]

Episode: 28, Total Reward: [326.69632]


 30%|████████████████████████▎                                                        | 30/100 [08:15<18:27, 15.82s/it]

Episode: 29, Total Reward: [258.19937]


 31%|█████████████████████████                                                        | 31/100 [08:31<18:15, 15.88s/it]

Episode: 30, Total Reward: [277.49796]


 32%|█████████████████████████▉                                                       | 32/100 [08:43<16:34, 14.62s/it]

Episode: 31, Total Reward: [278.99756]


 33%|██████████████████████████▋                                                      | 33/100 [08:58<16:32, 14.81s/it]

Episode: 32, Total Reward: [321.9975]


 34%|███████████████████████████▌                                                     | 34/100 [09:19<18:09, 16.51s/it]

Episode: 33, Total Reward: [174.20027]


 35%|████████████████████████████▎                                                    | 35/100 [09:38<18:49, 17.38s/it]

Episode: 34, Total Reward: [299.598]


 36%|█████████████████████████████▏                                                   | 36/100 [09:52<17:29, 16.40s/it]

Episode: 35, Total Reward: [309.09818]


 37%|█████████████████████████████▉                                                   | 37/100 [10:15<19:13, 18.32s/it]

Episode: 36, Total Reward: [347.0962]


 38%|██████████████████████████████▊                                                  | 38/100 [10:35<19:26, 18.82s/it]

Episode: 37, Total Reward: [-32.19963]


 39%|███████████████████████████████▌                                                 | 39/100 [10:54<19:16, 18.96s/it]

Episode: 38, Total Reward: [247.69916]


 40%|████████████████████████████████▍                                                | 40/100 [11:09<17:36, 17.61s/it]

Episode: 39, Total Reward: [363.69794]


 41%|█████████████████████████████████▏                                               | 41/100 [11:23<16:28, 16.76s/it]

Episode: 40, Total Reward: [146.09961]


 42%|██████████████████████████████████                                               | 42/100 [11:56<20:50, 21.57s/it]

Episode: 41, Total Reward: [409.39822]


 43%|██████████████████████████████████▊                                              | 43/100 [12:15<19:50, 20.88s/it]

Episode: 42, Total Reward: [120.70005]


 44%|███████████████████████████████████▋                                             | 44/100 [12:34<18:43, 20.06s/it]

Episode: 43, Total Reward: [110.09954]


 45%|████████████████████████████████████▍                                            | 45/100 [12:49<17:03, 18.61s/it]

Episode: 44, Total Reward: [205.9984]


 46%|█████████████████████████████████████▎                                           | 46/100 [13:07<16:40, 18.52s/it]

Episode: 45, Total Reward: [305.99753]


 47%|██████████████████████████████████████                                           | 47/100 [13:55<24:03, 27.24s/it]

Episode: 46, Total Reward: [158.19467]


 48%|██████████████████████████████████████▉                                          | 48/100 [14:08<20:01, 23.10s/it]

Episode: 47, Total Reward: [253.90015]


 49%|███████████████████████████████████████▋                                         | 49/100 [14:27<18:26, 21.69s/it]

Episode: 48, Total Reward: [201.79916]


 50%|████████████████████████████████████████▌                                        | 50/100 [14:38<15:31, 18.62s/it]

Episode: 49, Total Reward: [245.39816]


 51%|█████████████████████████████████████████▎                                       | 51/100 [14:52<14:04, 17.24s/it]

Episode: 50, Total Reward: [311.89816]


 52%|██████████████████████████████████████████                                       | 52/100 [15:06<13:01, 16.28s/it]

Episode: 51, Total Reward: [88.799164]


 53%|██████████████████████████████████████████▉                                      | 53/100 [15:24<13:02, 16.65s/it]

Episode: 52, Total Reward: [329.99704]


 54%|███████████████████████████████████████████▋                                     | 54/100 [15:40<12:45, 16.65s/it]

Episode: 53, Total Reward: [353.39807]


 55%|████████████████████████████████████████████▌                                    | 55/100 [16:05<14:19, 19.11s/it]

Episode: 54, Total Reward: [200.99884]


 56%|█████████████████████████████████████████████▎                                   | 56/100 [16:30<15:13, 20.75s/it]

Episode: 55, Total Reward: [256.6946]


 57%|██████████████████████████████████████████████▏                                  | 57/100 [16:43<13:10, 18.39s/it]

Episode: 56, Total Reward: [82.49949]


 58%|██████████████████████████████████████████████▉                                  | 58/100 [16:59<12:24, 17.72s/it]

Episode: 57, Total Reward: [216.29709]


 59%|███████████████████████████████████████████████▊                                 | 59/100 [17:22<13:11, 19.29s/it]

Episode: 58, Total Reward: [330.99603]


 60%|████████████████████████████████████████████████▌                                | 60/100 [17:53<15:20, 23.01s/it]

Episode: 59, Total Reward: [28.200552]


 61%|█████████████████████████████████████████████████▍                               | 61/100 [18:08<13:16, 20.42s/it]

Episode: 60, Total Reward: [293.0982]


 62%|██████████████████████████████████████████████████▏                              | 62/100 [18:11<09:35, 15.16s/it]

Episode: 61, Total Reward: [-106.09999]


 63%|███████████████████████████████████████████████████                              | 63/100 [18:42<12:26, 20.17s/it]

Episode: 62, Total Reward: [171.99667]


 64%|███████████████████████████████████████████████████▊                             | 64/100 [19:52<20:57, 34.94s/it]

Episode: 63, Total Reward: [332.88235]


 65%|████████████████████████████████████████████████████▋                            | 65/100 [20:25<20:01, 34.34s/it]

Episode: 64, Total Reward: [367.7934]


 66%|█████████████████████████████████████████████████████▍                           | 66/100 [20:27<14:03, 24.82s/it]

Episode: 65, Total Reward: [-105.7]


 67%|██████████████████████████████████████████████████████▎                          | 67/100 [20:44<12:13, 22.22s/it]

Episode: 66, Total Reward: [301.39798]


 68%|███████████████████████████████████████████████████████                          | 68/100 [21:02<11:12, 21.01s/it]

Episode: 67, Total Reward: [235.39914]


 69%|███████████████████████████████████████████████████████▉                         | 69/100 [21:13<09:19, 18.06s/it]

Episode: 68, Total Reward: [250.99857]


 70%|████████████████████████████████████████████████████████▋                        | 70/100 [21:28<08:36, 17.23s/it]

Episode: 69, Total Reward: [367.79807]


 71%|█████████████████████████████████████████████████████████▌                       | 71/100 [21:41<07:37, 15.78s/it]

Episode: 70, Total Reward: [263.89822]


 72%|██████████████████████████████████████████████████████████▎                      | 72/100 [22:01<08:04, 17.29s/it]

Episode: 71, Total Reward: [192.80081]


 73%|███████████████████████████████████████████████████████████▏                     | 73/100 [22:18<07:40, 17.07s/it]

Episode: 72, Total Reward: [385.89746]


 74%|███████████████████████████████████████████████████████████▉                     | 74/100 [22:41<08:12, 18.95s/it]

Episode: 73, Total Reward: [245.99988]


 75%|████████████████████████████████████████████████████████████▊                    | 75/100 [22:55<07:12, 17.31s/it]

Episode: 74, Total Reward: [234.99838]


 76%|█████████████████████████████████████████████████████████████▌                   | 76/100 [23:11<06:50, 17.09s/it]

Episode: 75, Total Reward: [209.49829]


 77%|██████████████████████████████████████████████████████████████▎                  | 77/100 [23:27<06:24, 16.70s/it]

Episode: 76, Total Reward: [33.900524]


 78%|███████████████████████████████████████████████████████████████▏                 | 78/100 [23:42<05:58, 16.28s/it]

Episode: 77, Total Reward: [378.29755]


 79%|███████████████████████████████████████████████████████████████▉                 | 79/100 [23:57<05:28, 15.66s/it]

Episode: 78, Total Reward: [239.1991]


 80%|████████████████████████████████████████████████████████████████▊                | 80/100 [24:10<04:56, 14.84s/it]

Episode: 79, Total Reward: [220.29822]


 81%|█████████████████████████████████████████████████████████████████▌               | 81/100 [24:12<03:32, 11.18s/it]

Episode: 80, Total Reward: [-105.7]


 82%|██████████████████████████████████████████████████████████████████▍              | 82/100 [24:27<03:40, 12.26s/it]

Episode: 81, Total Reward: [148.2985]


 83%|███████████████████████████████████████████████████████████████████▏             | 83/100 [24:30<02:38,  9.33s/it]

Episode: 82, Total Reward: [-105.7]


 84%|████████████████████████████████████████████████████████████████████             | 84/100 [24:32<01:56,  7.30s/it]

Episode: 83, Total Reward: [-105.7]


 85%|████████████████████████████████████████████████████████████████████▊            | 85/100 [24:55<03:01, 12.12s/it]

Episode: 84, Total Reward: [190.19843]


 86%|█████████████████████████████████████████████████████████████████████▋           | 86/100 [25:12<03:08, 13.46s/it]

Episode: 85, Total Reward: [248.49854]


 87%|██████████████████████████████████████████████████████████████████████▍          | 87/100 [25:29<03:07, 14.43s/it]

Episode: 86, Total Reward: [101.69888]


 88%|███████████████████████████████████████████████████████████████████████▎         | 88/100 [25:45<03:00, 15.01s/it]

Episode: 87, Total Reward: [162.89977]


 89%|████████████████████████████████████████████████████████████████████████         | 89/100 [26:05<03:02, 16.62s/it]

Episode: 88, Total Reward: [180.39551]


 90%|████████████████████████████████████████████████████████████████████████▉        | 90/100 [26:25<02:53, 17.39s/it]

Episode: 89, Total Reward: [-54.899483]


 91%|█████████████████████████████████████████████████████████████████████████▋       | 91/100 [26:42<02:36, 17.36s/it]

Episode: 90, Total Reward: [201.1995]


 92%|██████████████████████████████████████████████████████████████████████████▌      | 92/100 [26:54<02:06, 15.75s/it]

Episode: 91, Total Reward: [257.49786]


 93%|███████████████████████████████████████████████████████████████████████████▎     | 93/100 [27:09<01:49, 15.57s/it]

Episode: 92, Total Reward: [221.49884]


 94%|████████████████████████████████████████████████████████████████████████████▏    | 94/100 [27:26<01:35, 15.98s/it]

Episode: 93, Total Reward: [83.29966]


 95%|████████████████████████████████████████████████████████████████████████████▉    | 95/100 [27:42<01:19, 15.89s/it]

Episode: 94, Total Reward: [67.80074]


 96%|█████████████████████████████████████████████████████████████████████████████▊   | 96/100 [27:58<01:03, 15.97s/it]

Episode: 95, Total Reward: [224.59705]


 97%|██████████████████████████████████████████████████████████████████████████████▌  | 97/100 [28:21<00:54, 18.21s/it]

Episode: 96, Total Reward: [105.800125]


 98%|███████████████████████████████████████████████████████████████████████████████▍ | 98/100 [28:41<00:37, 18.57s/it]

Episode: 97, Total Reward: [226.19849]


 99%|████████████████████████████████████████████████████████████████████████████████▏| 99/100 [28:58<00:18, 18.25s/it]

Episode: 98, Total Reward: [312.8993]


100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [29:13<00:00, 17.53s/it]

Episode: 99, Total Reward: [260.49854]
Average Reward:  198.06438





In [8]:
# Summary statistics (count, mean, std, min, quartiles, max) of the values
# collected in `test_set` -- presumably the per-episode total rewards from the
# evaluation run above; confirm against the cell that fills `test_set`.
reward_frame = pd.DataFrame(test_set)
reward_frame.describe()

Unnamed: 0,0
count,100.0
mean,198.064377
std,137.629547
min,-117.000015
25%,119.674997
50%,223.047943
75%,293.948242
max,476.595093


## Record Videos

In [3]:
'''
Complexity is set through 3 parameters:
num_tracks: 1 for a basic track, 2 for more complex tracks with intersections (X, T type)
num_lanes: number of lanes for the track; 1 for normal, 2 for multiple lanes
num_lanes_changes: number of times two lanes will be merged into 1 over the track


Obstacles/bonuses are controlled by 2 parameters:
num_obstacles: number of obstacles per sub-section of the track (if different from 0 then there are obstacles)
If the car touches an obstacle, 50 points are deducted from the score
prop_good_obstacles: probability (set between 0-1) for an obstacle to be a bonus (yellow color instead of red + reward of 50 points if touched)
'''


def make_car_racing_env():
    """Build one CarRacing environment with the settings used for video recording.

    The configuration selects a single track with a single lane and no
    obstacles (``num_tracks=1``, ``num_lanes=1``, ``num_obstacles=0``); see the
    notes at the top of this cell for what the complexity and obstacle
    parameters mean.

    :return: a freshly constructed ``CarRacing`` instance
    """
    return CarRacing(
        grayscale=1,
        show_info_panel=0,
        discretize_actions="hard",
        frames_per_state=4,
        num_lanes=1,
        num_lanes_changes=1,
        num_tracks=1,
        allow_reverse=False,
        max_time_out=2,
        verbose=0,
        num_obstacles=0,
        prop_good_obstacles=0.1)


# DummyVecEnv receives a list of zero-argument factories; passing a single
# factory yields a vectorized environment wrapping exactly one CarRacing env.
# A named factory is used so that `env` only ever refers to the vectorized
# environment (previously it was first bound to a lambda, then rebound).
env = DummyVecEnv([make_car_racing_env])



In [4]:
# Load a trained PPO2 agent and play it in the environment while recording video.
# Swap the two `PPO2.load` lines below to record the other agent.
# model = PPO2.load('artifacts/PPO_MODEL_OBS-v56/PPO_MODEL_Obstacles1',env=env) #Tuned agent trained on complex track and obstacles/bonus
model = PPO2.load('artifacts/PPO_MODEL_1-v98/PPO_MODEL_1',env=env) # Baseline agent trained on the base environment

N_VIDEO_EPISODES = 10  # number of episodes played by the agent in this cell
VIDEO_LENGTH = 1500    # value passed to the recorder as `video_length`

# Record the video starting at the first step
env = VecVideoRecorder(env, 'video/',
                       record_video_trigger=lambda step: step == 0, video_length=VIDEO_LENGTH,
                       name_prefix="ppo_agent-based")

try:
    for episode in range(N_VIDEO_EPISODES):
        obs = env.reset()
        total_reward = 0
        done = False
        # `env` wraps a single environment, so `done` and `reward` come back as
        # one-element arrays: `not done` is well defined and `total_reward`
        # becomes a one-element array (printed as e.g. `[135.80063]`).
        while not done:
            # The second value returned by `predict` is not needed here.
            action, _states = model.predict(obs)
            obs, reward, done, info = env.step(action)
            total_reward += reward
            env.render()
        print(f'Episode: {episode}, Total Reward: {total_reward}')
finally:
    # Always close the recorder (and the environment it wraps), even if the
    # rollout raises or the kernel is interrupted part-way through.
    env.close()


Episode: 0, Total Reward: [135.80063]
Episode: 1, Total Reward: [128.30043]
Episode: 2, Total Reward: [150.4997]
Episode: 3, Total Reward: [140.50046]
Episode: 4, Total Reward: [-21.799583]
Episode: 5, Total Reward: [142.3002]
Episode: 6, Total Reward: [149.90007]
Episode: 7, Total Reward: [126.70076]
Episode: 8, Total Reward: [146.3001]
Episode: 9, Total Reward: [146.09993]
