In [1]:
import tensorflow as tf
import gym
from gym.wrappers import Monitor

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=Warning)
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)

from gym.envs.box2d import CarRacing
from stable_baselines.common import make_vec_env
from stable_baselines.deepq.policies import CnnPolicy
from stable_baselines.common.vec_env import DummyVecEnv,VecVideoRecorder
from stable_baselines import DQN
import glob
import base64
import wandb
import time
import os
import numpy as np
import pandas as pd
from tqdm import tqdm
from typing import Any, Dict

## General Functions

In [2]:
from stable_baselines.common.callbacks import BaseCallback

class DQNCustomCallback(BaseCallback):
    """
    Callback that reports per-episode reward to Weights & Biases and
    periodically checkpoints the model as a W&B artifact.

    The reward of every environment step is accumulated; when an episode
    ends the sum is logged under ``reward_per_episode``. Every
    ``log_interval`` episodes (the value given to ``learn()``) the model is
    additionally saved to disk and uploaded as a ``model`` artifact.

    :param verbose: (int) Verbosity level 0: not output 1: info 2: debug
    :param model_save_name: (str or None) Base name (without ``.zip``) of the
        checkpoint file and of the artifact. When ``None`` the module-level
        ``MODEL_SAVE_NAME`` is looked up at save time, so it only has to be
        defined before training starts.
    """
    def __init__(self, verbose=0, model_save_name=None):
        super(DQNCustomCallback, self).__init__(verbose)
        # The base class provides self.model, self.training_env, self.n_calls,
        # self.num_timesteps, self.locals, self.globals, self.logger and
        # self.parent; only the episode bookkeeping is added here.
        self.model_save_name = model_save_name
        # 1-based index of the episode in progress (0 until the first step).
        self.episodes = 0
        # Sum of the step rewards of the episode in progress.
        self.total_episode_reward = 0

    def _on_training_start(self) -> None:
        """
        This method is called before the first rollout starts.
        """
        pass

    def _on_rollout_start(self) -> None:
        """
        A rollout is the collection of environment interaction
        using the current policy.
        This event is triggered before collecting new samples.
        """
        pass

    def _step_reward(self):
        """
        Return the reward of the environment step that triggered the callback.

        In stable-baselines' ``DQN.learn()`` the callback is invoked right
        after ``env.step()`` and before ``reward_`` is (re)assigned, so at
        this point ``reward_`` still holds the previous step's reward and is
        missing altogether on the very first step. ``rew`` is the reward of
        the step just taken, so it is preferred; ``reward_`` is only kept as
        a fallback for training loops that do not expose ``rew``.
        """
        step_locals = self.locals
        if "rew" in step_locals:
            return step_locals["rew"]
        return step_locals.get("reward_", 0)

    def _save_checkpoint(self) -> None:
        """
        Save the model to ``<name>.zip`` and upload it as a W&B model artifact.
        """
        save_name = self.model_save_name
        if save_name is None:
            # Resolved lazily so the constant may be defined in a later cell.
            save_name = MODEL_SAVE_NAME
        self.model.save(save_name)
        # Save as artifact for version control.
        artifact = wandb.Artifact(save_name, type='model')
        artifact.add_file(save_name + ".zip")
        wandb.log_artifact(artifact)

    def _on_step(self) -> bool:
        """
        Accumulate the step reward and, at episode end, log and checkpoint.

        :return: (bool) Always ``True`` so that training is never aborted.
        """
        if self.episodes == 0:
            # First call: the first episode is now in progress.
            self.episodes = 1

        self.total_episode_reward += self._step_reward()

        # at the end of every episode
        if self.locals["done"]:
            wandb.log({"reward_per_episode": self.total_episode_reward})

            # if log interval has passed
            log_interval = self.locals.get("log_interval")
            if log_interval and self.episodes % log_interval == 0:
                self._save_checkpoint()

            self.episodes += 1
            self.total_episode_reward = 0

        return True

    def _on_rollout_end(self) -> None:
        """
        This event is triggered before updating the policy.
        """
        pass

    def _on_training_end(self) -> None:
        """
        This event is triggered before exiting the learn() method.
        """
        pass

# Baseline environment

## Training

In [5]:
def env():
    """
    Build a fresh CarRacing instance with the baseline training settings
    (one track, one lane, no obstacles).

    Kept as a zero-argument factory because ``make_vec_env`` expects a
    callable it can invoke to create the environment.
    """
    car_racing_kwargs = dict(
        grayscale=1,
        show_info_panel=0,
        discretize_actions="hard",
        frames_per_state=4,
        num_lanes=1,
        num_lanes_changes=1,
        num_tracks=1,
        allow_reverse=False,
        max_time_out=2,
        verbose=0,
        num_obstacles=0,
    )
    return CarRacing(**car_racing_kwargs)

In [6]:
# Index of this experiment; used to derive the W&B run id/name and the
# checkpoint file name.
num = 3

LOG_INTERVAL        = 50
WANDB_ID            = "" + str(num)
WNDB_NAME           = "Antoine" + str(num)
LOAD_SAVED_MODEL    = False
MODEL_SAVE_NAME     = "DQN_MODEL_" + str(num)
SAVED_MODEL_VERSION = "latest"
BUFFER_SIZE         = 150000
LEARNING_STARTS     = 3000
TOTAL_TIMESTEPS     = 1000000

os.environ["WANDB_ENTITY"]  = "ant_ai"
os.environ["WANDB_PROJECT"] = "DQN_No_Obstacles"
os.environ["WANDB_RESUME"]  = "allow"
# `resume` selects the resume policy; the run to resume is identified by `id`.
wandb.init(id=WANDB_ID, resume="allow")
wandb.run.name = WNDB_NAME

# Use a dedicated name for the vectorised env so that `env` stays the factory
# and this cell can be re-run without feeding a VecEnv back into make_vec_env.
train_env = make_vec_env(env, n_envs=1)

# Load model
if LOAD_SAVED_MODEL:
    try:
        model_artifact = wandb.use_artifact(MODEL_SAVE_NAME+':'+SAVED_MODEL_VERSION, type='model')
        artifact_dir = model_artifact.download()
        DQNmodel = DQN.load(os.path.join(artifact_dir, MODEL_SAVE_NAME), env=train_env)
    except Exception:
        # Surface the real cause instead of continuing with an undefined (or
        # stale, left over from a previous run) DQNmodel.
        print("NO MODEL FOUND")
        train_env.close()
        raise
    print("LOAD SAVED DQN MODEL")
else:
    # NOTE(review): this branch depends on kernel state — a DQNmodel left over
    # from an earlier execution makes the cell reload the checkpoint on disk.
    if 'DQNmodel' not in globals():
        DQNmodel = DQN(CnnPolicy, train_env, verbose=1, buffer_size=BUFFER_SIZE, learning_starts=LEARNING_STARTS)
        print("INITIALIZE NEW DQN MODEL")
    else:
        DQNmodel = DQN.load(MODEL_SAVE_NAME, env=train_env)
        print("CONTINUE DQN MODEL TRAINING")

# Train model
try:
    DQNmodel.learn(total_timesteps=TOTAL_TIMESTEPS, log_interval=LOG_INTERVAL, callback=DQNCustomCallback())
finally:
    # Also runs when training is interrupted from the notebook, so the latest
    # weights are kept and the environment is always released.
    try:
        DQNmodel.save(MODEL_SAVE_NAME)
    finally:
        train_env.close()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mant_ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


INITIALIZE NEW DQN MODEL
--------------------------------------
| % time spent exploring  | 75       |
| episodes                | 50       |
| mean 100 episode reward | -141     |
| steps                   | 24777    |
--------------------------------------
--------------------------------------
| % time spent exploring  | 44       |
| episodes                | 100      |
| mean 100 episode reward | -146     |
| steps                   | 56405    |
--------------------------------------
--------------------------------------
| % time spent exploring  | 26       |
| episodes                | 150      |
| mean 100 episode reward | -137     |
| steps                   | 75208    |
--------------------------------------
--------------------------------------
| % time spent exploring  | 13       |
| episodes                | 200      |
| mean 100 episode reward | -119     |
| steps                   | 88372    |
--------------------------------------
--------------------------------------


KeyboardInterrupt: 

In [7]:
# Mark the W&B run opened by wandb.init() in the training cell as finished.
wandb.finish()

0,1
reward_per_episode,▄▆▁▂▃▅▅▅▄▅▅▅▆▆▆▆▆▆▆▅▅▇▆▆▆█▆▆▇▆▆▆▅▆▆▇▄▆▆▇

0,1
reward_per_episode,-105.6


## Evaluate model

In [2]:
# Single-environment VecEnv used to evaluate a trained agent.
# NOTE(review): unlike the training and recording cells, `num_obstacles` is not
# passed here, so CarRacing's default applies — confirm it matches.
env = DummyVecEnv([
    lambda: CarRacing(
        grayscale=1, show_info_panel=0, frames_per_state=4,
        discretize_actions="hard", allow_reverse=False, max_time_out=2,
        num_tracks=1, num_lanes=1, num_lanes_changes=1,
        verbose=0,
    )
])

In [3]:
# Number of evaluation episodes to play.
N_EVAL_EPISODES = 10

model = DQN.load('Model Weights/DQN/Baseline Environment Agent/DQN_MODEL_1-v50/DQN_MODEL_1',env=env)

model.set_env(env)

try:
    for episode in range(N_EVAL_EPISODES):
        obs = env.reset()
        total_reward = 0
        done=False
        # `env` wraps a single environment, so `reward` and `done` are
        # one-element arrays; that is why `not done` is a valid truth test
        # and why the total is printed as a one-element array.
        while not done:
            action, states = model.predict(obs)
            obs, reward, done, info = env.step(action)
            total_reward += reward
            env.render()
        print(f'Episode: {episode}, Total Reward: {total_reward}')
finally:
    # Release the environment even if evaluation fails or is interrupted.
    env.close()

Episode: 0, Total Reward: [-44.699608]
Episode: 1, Total Reward: [-105.7]
Episode: 2, Total Reward: [111.40096]
Episode: 3, Total Reward: [-105.7]
Episode: 4, Total Reward: [-105.7]
Episode: 5, Total Reward: [-58.999573]
Episode: 6, Total Reward: [-104.7]
Episode: 7, Total Reward: [-50.499493]
Episode: 8, Total Reward: [-96.60005]
Episode: 9, Total Reward: [-48.598885]


## Record Videos

In [2]:
'''
Complexity is set through 3 parameters:
num_tracks: 1 for a basic track, 2 for more complex tracks with intersections (X- or T-type)
num_lanes: number of lanes of the track; 1 for a normal track, 2 for multiple lanes
num_lanes_changes: number of times two lanes are merged into 1 over the track


Obstacles/bonuses are controlled by 2 parameters:
num_obstacles: number of obstacles per subsection of the track (if different from 0, there are obstacles)
If the car touches an obstacle, 50 points are deducted from the score
prop_good_obstacles: probability (between 0 and 1) for an obstacle to be a bonus (yellow instead of red, and 50 points are awarded if touched)
'''


# Single-environment VecEnv (one track, one lane, no obstacles) used for
# recording videos of the agent.
env = DummyVecEnv([
    lambda: CarRacing(
        grayscale=1, show_info_panel=0, frames_per_state=4,
        discretize_actions="hard", allow_reverse=False, max_time_out=2,
        num_tracks=1, num_lanes=1, num_lanes_changes=1,
        num_obstacles=0,
        verbose=0,
    )
])



In [3]:
# Number of episodes to play while the recorder is attached.
N_RECORD_EPISODES = 10

model = DQN.load('Model Weights/DQN/Baseline Environment Agent/DQN_MODEL_1-v50/DQN_MODEL_1',env=env) # Baseline agent trained on the base environment

# Record the video starting at the first step.
# A separate name is used for the recorder so that `env` is not rebound and a
# re-run of this cell does not wrap a recorder inside another recorder.
video_env = VecVideoRecorder(env, 'video/',
                             record_video_trigger=lambda step: step == 0, video_length=1500,
                             name_prefix="DQN_agent-based")

try:
    for episode in range(N_RECORD_EPISODES):
        obs = video_env.reset()
        total_reward = 0
        done=False
        # The wrapped VecEnv holds a single environment, so `reward` and
        # `done` are one-element arrays.
        while not done:
            action, states = model.predict(obs)
            obs, reward, done, info = video_env.step(action)
            total_reward += reward
            video_env.render()
        print(f'Episode: {episode}, Total Reward: {total_reward}')
finally:
    # Always close the recorder (and the wrapped env), including on error or
    # interrupt, so a recording in progress is not left open.
    video_env.close()


Episode: 0, Total Reward: [-105.7]
Episode: 1, Total Reward: [-86.70007]
Episode: 2, Total Reward: [-57.998413]
Episode: 3, Total Reward: [-106.09999]
Episode: 4, Total Reward: [-32.099487]
Episode: 5, Total Reward: [-105.7]
Episode: 6, Total Reward: [-105.7]
Episode: 7, Total Reward: [-107.2]
Episode: 8, Total Reward: [-105.29999]
Episode: 9, Total Reward: [-104.7]
