### Introduction
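This notebook trains a new DQN agent with the Stable-Baselines3 framework on the Gymnasium `CarRacing-v2` environment (discrete action space), logs training progress and model checkpoints to Weights & Biases, and then evaluates a saved model.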

In [1]:
!apt-get install -y swig
!pip install box2d-py
!pip install 'gymnasium[box2d]'
!pip install 'stable-baselines3[extra]'
!pip install toml
!pip install wandb

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  swig4.0
Suggested packages:
  swig-doc swig-examples swig4.0-examples swig4.0-doc
The following NEW packages will be installed:
  swig swig4.0
0 upgraded, 2 newly installed, 0 to remove and 19 not upgraded.
Need to get 1,116 kB of archives.
After this operation, 5,542 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/universe amd64 swig4.0 amd64 4.0.2-1ubuntu1 [1,110 kB]
Get:2 http://archive.ubuntu.com/ubuntu jammy/universe amd64 swig all 4.0.2-1ubuntu1 [5,632 B]
Fetched 1,116 kB in 2s (604 kB/s)
Selecting previously unselected package swig4.0.
(Reading database ... 120874 files and directories currently installed.)
Preparing to unpack .../swig4.0_4.0.2-1ubuntu1_amd64.deb ...
Unpacking swig4.0 (4.0.2-1ubuntu1) ...
Selecting previously unselected package swig.
Preparing to unpack .../swig_4.0.2-1ubunt

In [1]:
import math
import random
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import torch
import  gymnasium as gym

import glob
import io
import wandb

# Pick the CUDA device when one is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [2]:
# Build the discrete-action variant of the environment and report its spaces.
env = gym.make('CarRacing-v2', continuous=False)
for label, space in (
    ("Observation space: ", env.observation_space),
    ("Action space: ", env.action_space),
):
    print(label, space)

Observation space:  Box(0, 255, (96, 96, 3), uint8)
Action space:  Discrete(5)


In [3]:
# Reset the environment, then advance it with random actions before
# inspecting the shape of the resulting observation.
s, info = env.reset()
n_warmup_steps = 50  # skip 50 steps
for _ in range(n_warmup_steps):
    act = env.action_space.sample()
    s = env.step(act)[0]  # keep only the observation from the step result
print(s.shape)

(96, 96, 3)


In [4]:
import os

import toml
import wandb

# Training hyper-parameters, kept as a TOML-formatted string so the same
# dictionary can be handed to W&B as the run config.
config_toml = """
PREFIX              = 'DQN_BASE'
NUM_OF_STEPS        = 5000
NUM_OF_EPISODES     = 1000
LOG_INTERVAL        = 50
BUFFER_SIZE         = 150000
LEARNING_STARTS     = 5000
"""

# Note: keys keep their upper-case spelling (e.g. config["PREFIX"]).
config = toml.loads(config_toml)

# When True, the training cell downloads a saved model artifact instead of
# creating a fresh model.
LOAD_SAVED_MODEL    = False

# Run bookkeeping: the run number is embedded in the W&B run id, the display
# name and the model/artifact name.
RUN_NUM = 15
WANDB_ID            = "dql_rl_"+str(RUN_NUM)
WNDB_NAME           = "DQL_RL_"+str(RUN_NUM)
MODEL_SAVE_NAME     = WNDB_NAME
SAVED_MODEL_VERSION = "latest"

os.environ['WANDB_NOTEBOOK_NAME'] = 'DQN_N.ipynb'
wandb.login()
# `resume` expects a mode ("allow", "must", "never", "auto"); the run to
# resume is selected with `id`. Passing the run id as `resume` relied on a
# legacy fallback, so both are spelled out explicitly here.
wandb.init(
    id=WANDB_ID,
    resume="allow",
    entity="yakiv",
    project="CarRacingDT",
    config=config,
)
wandb.run.name = WNDB_NAME


[34m[1mwandb[0m: Currently logged in as: [33myakiv[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [5]:
from stable_baselines3.common.callbacks import BaseCallback

class DQNCustomCallback(BaseCallback):
    """
    Callback that accumulates the reward of the current episode, logs it to
    W&B when the episode ends, and every ``log_interval`` episodes saves the
    model and uploads it as a W&B artifact.

    :param verbose: (int) Verbosity level 0: not output 1: info 2: debug
    """
    def __init__(self, verbose=0):
        super().__init__(verbose)
        # Number of episodes finished so far.
        self.episodes = 0
        # Running reward sum of the episode currently in progress.
        self.total_episode_reward = 0

    def _on_training_start(self) -> None:
        """No-op hook."""

    def _on_rollout_start(self) -> None:
        """No-op hook."""

    def _on_step(self) -> bool:
        """Accumulate the step reward; on episode end, checkpoint (if due) and log.

        :return: always ``True``.
        """
        # update cumulative reward to log at the end of every episode
        self.total_episode_reward += self.locals["rewards"].mean()

        # Only the first entry of the ``dones`` array is inspected.
        episode_finished = self.locals["dones"][0].item()
        if not episode_finished:
            return True

        # Every ``log_interval`` episodes (including episode 0): save the model
        # and upload the resulting zip as an artifact for version control.
        if self.episodes % self.locals["log_interval"] == 0:
            self.model.save(MODEL_SAVE_NAME)
            checkpoint = wandb.Artifact(MODEL_SAVE_NAME, type='model')
            checkpoint.add_file(MODEL_SAVE_NAME + ".zip")
            wandb.log_artifact(checkpoint)

        # The episode reward is logged exactly once per finished episode.
        wandb.log({"reward_per_episode": self.total_episode_reward})

        self.episodes += 1
        self.total_episode_reward = 0
        return True

    def _on_rollout_end(self) -> None:
        """No-op hook."""

    def _on_training_end(self) -> None:
        """No-op hook."""

In [6]:
from stable_baselines3 import DQN
from stable_baselines3.dqn import CnnPolicy


# Load model
if LOAD_SAVED_MODEL:
    # Resume from the model artifact stored in W&B.
    try:
        model_artifact = wandb.use_artifact(MODEL_SAVE_NAME+':'+SAVED_MODEL_VERSION, type='model')
        artifact_dir = model_artifact.download()
        DQNmodel = DQN.load(artifact_dir+"/"+MODEL_SAVE_NAME, env=env, tensorboard_log="./tensorboard/")
    except Exception as exc:
        # Previously a bare `except:` swallowed the error and left `DQNmodel`
        # undefined, so training failed later with an unrelated NameError.
        # Fail here instead, keeping the original cause attached.
        print("NO MODEL FOUND")
        raise RuntimeError(
            "Could not load saved model artifact "
            + MODEL_SAVE_NAME + ':' + SAVED_MODEL_VERSION
        ) from exc
    print("LOAD SAVED DQN MODEL")
else:
    # Start from scratch. (The former `if True: ... else: DQN.load(...)`
    # toggle had an unreachable branch and was removed.)
    DQNmodel = DQN(
        CnnPolicy,
        env,
        verbose=1,
        buffer_size=config["BUFFER_SIZE"],
        learning_starts=config["LEARNING_STARTS"],
        tensorboard_log="./tensorboard/",
    )
    print("INITIALIZE NEW DQN MODEL")


# Train model
total_timesteps = config["NUM_OF_STEPS"] * config["NUM_OF_EPISODES"]
DQNmodel.learn(
    total_timesteps=total_timesteps,
    log_interval=config["LOG_INTERVAL"],
    callback=DQNCustomCallback(),
)

# Save model
# The config keys are upper-case; config['prefix'] raised KeyError once
# training had finished.
DQNmodel.save(config['PREFIX'] + "_model")



Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Wrapping the env in a VecTransposeImage.


  from .autonotebook import tqdm as notebook_tqdm


INITIALIZE NEW DQN MODEL
Logging to ./tensorboard/DQN_1


KeyboardInterrupt: 

## Evaluate our DQN model

In [3]:
from stable_baselines3 import DQN
import gymnasium as gym
from stable_baselines3 import DQN

import wandb
import random
def evaluate_version(model, env, model_save_name, model_save_version, video_path, epsilon=0.0,
                     model_path=None):
    """Load a saved model and play a single rendered episode with it.

    :param model: object exposing ``load(path)`` (e.g. the ``DQN`` class); the
        loaded object must expose ``predict(obs, deterministic=True)``.
    :param env: environment to play in; it is closed before returning.
    :param model_save_name: currently unused (kept for interface compatibility).
    :param model_save_version: currently unused (kept for interface compatibility).
    :param video_path: currently unused (kept for interface compatibility).
    :param epsilon: probability, per step, of overriding the model's action
        with the fixed action ``3``.
    :param model_path: path of the saved model to load. When ``None`` the
        original hard-coded local path is used.
    :return: the sum of rewards collected during the episode.
    """
    if model_path is None:
        # NOTE(review): machine-specific default kept for backward
        # compatibility; pass `model_path` explicitly on other machines, e.g.
        # the path of a downloaded W&B artifact.
        model_path = '/Users/jacob/Documents/T/GCR/artifacts/DQL_RL_11:v41/DQL_RL_11.zip'
    loaded_model = model.load(model_path)

    total_reward = 0
    try:
        # play model
        obs = env.reset()
        # reset() may return an (observation, info) pair; keep the observation.
        if isinstance(obs, tuple) and len(obs) == 2:
            obs = obs[0]
        done = False
        sti = 0
        while not done:
            sti = sti + 1
            if random.random() < epsilon:
                # NOTE(review): presumably action 3 is "gas" in the discrete
                # action set — confirm against the environment docs.
                action = 3
            else:
                action, _states = loaded_model.predict(obs, deterministic=True)
            obs, reward, terminated, truncated, _info = env.step(action)
            # An episode ends on termination OR truncation (time limit);
            # checking only `terminated` made the loop run forever once the
            # episode was truncated.
            done = terminated or truncated
            total_reward = total_reward + reward
            print(f"Step {sti} Total: {total_reward} Step: {reward} Done: {done} ")
            env.render()
    finally:
        # Always release the environment, even if stepping or predicting fails.
        env.close()
    return total_reward


# Create a discrete-action environment that renders in 'human' mode and run
# the evaluation with a 0.4 chance per step of overriding the model's action.
env = gym.make(
    'CarRacing-v2',
    continuous=False,
    render_mode='human',
)
evaluate_version(
    DQN,
    env,
    model_save_name='DQL_RL_11',
    model_save_version='latest',
    video_path="",
    epsilon=0.4,
)




Exception: Can't get attribute '_make_function' on <module 'cloudpickle.cloudpickle' from '/Users/jacob/.pyenv/versions/3.10.12/lib/python3.10/site-packages/cloudpickle/cloudpickle.py'>
Exception: Can't get attribute '_make_function' on <module 'cloudpickle.cloudpickle' from '/Users/jacob/.pyenv/versions/3.10.12/lib/python3.10/site-packages/cloudpickle/cloudpickle.py'>
  from .autonotebook import tqdm as notebook_tqdm


CnnPolicy(
  (q_net): QNetwork(
    (features_extractor): NatureCNN(
      (cnn): Sequential(
        (0): Conv2d(3, 32, kernel_size=(8, 8), stride=(4, 4))
        (1): ReLU()
        (2): Conv2d(32, 64, kernel_size=(4, 4), stride=(2, 2))
        (3): ReLU()
        (4): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1))
        (5): ReLU()
        (6): Flatten(start_dim=1, end_dim=-1)
      )
      (linear): Sequential(
        (0): Linear(in_features=4096, out_features=512, bias=True)
        (1): ReLU()
      )
    )
    (q_net): Sequential(
      (0): Linear(in_features=512, out_features=5, bias=True)
    )
  )
  (q_net_target): QNetwork(
    (features_extractor): NatureCNN(
      (cnn): Sequential(
        (0): Conv2d(3, 32, kernel_size=(8, 8), stride=(4, 4))
        (1): ReLU()
        (2): Conv2d(32, 64, kernel_size=(4, 4), stride=(2, 2))
        (3): ReLU()
        (4): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1))
        (5): ReLU()
        (6): Flatten(start_dim=1, end_di