# Reinforcement Learning on Doom (ViZDoom) with Stable-Baselines3

## Import libraries and create the Gym environment

In [1]:
!pip install gym
!pip install stable-baselines3[extra]
!pip install vizdoom
import time
from vizdoom import *
import gym
from gym import Env
from gym.spaces import Discrete, Box
import numpy as np
import cv2
import os
import torch
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3 import PPO
from stable_baselines3.common.callbacks import BaseCallback

'pip' is not recognized as an internal or external command,
operable program or batch file.
'pip' is not recognized as an internal or external command,
operable program or batch file.
'pip' is not recognized as an internal or external command,
operable program or batch file.


In [2]:
class doomGym(Env):
    """Gym environment wrapping a ViZDoom game with a shaped reward.

    Observations are single-channel uint8 frames of shape (100, 160, 1).
    The action space is Discrete(7); each action index is sent to the game
    as the matching row of a 7x7 identity matrix (a one-hot button vector).

    Args:
        render: Whether the game window is shown. Interpreted by truthiness.
        config: Path of the ViZDoom config file passed to ``load_config``.
    """

    def __init__(self, render=False, config='deadly_corridor1.cfg'):
        super().__init__()
        self.game = DoomGame()
        self.game.load_config(config)
        # bool() also covers truthy/falsy non-bool values, which the previous
        # match on the literals True/False silently ignored.
        self.game.set_window_visible(bool(render))
        self.game.init()

        self.observation_space = Box(low=0, high=255, shape=(100,160,1), dtype=np.uint8)
        self.action_space = Discrete(7)

        # One-hot button vectors, built once instead of on every step.
        self._actions = np.identity(7)

        # Last seen game variables, used to turn absolute values into
        # per-step deltas. They are re-synchronised with the game in reset().
        self.kc = 0.0
        self.ammo = 52.0
        self.health = 100.0

    def step(self, action):
        """Apply ``action`` for 4 tics and return ``(state, reward, done, info)``.

        While the episode is running, the reward combines the reward returned
        by ``make_action`` with penalties for lost health and spent ammo and a
        bonus for an increased kill count (the second game variable).
        On the terminal step no game state is available, so the observation
        is an all-zero frame and only the ``make_action`` reward is used.
        """
        movement_reward = self.game.make_action(self._actions[action], 4)
        # Query the state once so the frame and the variables belong together.
        game_state = self.game.get_state()

        if game_state is not None:
            state = self.grayscale(game_state.screen_buffer)
            # Variable order is assumed to be health, kill count, ammo
            # (must match the loaded config).
            health, kc, ammo = game_state.game_variables
            health_delta = health - self.health
            self.health = health
            kc_delta = kc - self.kc
            self.kc = kc
            ammo_delta = ammo - self.ammo
            self.ammo = ammo
            # Only losses of health/ammo and gains in kill count are rewarded.
            health_delta = health_delta if health_delta<0.1 else 0.0
            kc_delta = kc_delta if kc_delta>0.1 else 0.0
            ammo_delta = ammo_delta if ammo_delta<0.1 else 0.0
            reward = movement_reward*2.25 + health_delta*20.0 + kc_delta*300.0  + ammo_delta*1.25
            info = ammo
        else:
            # Terminal observation must match the declared space (uint8, not
            # the float64 that np.zeros produces by default).
            state = np.zeros(self.observation_space.shape, dtype=self.observation_space.dtype)
            # Keep the reward of the final tics instead of dropping it, with
            # the same scaling used on non-terminal steps.
            reward = movement_reward*2.25
            info = 0.0

        info = {"info":info}
        done = self.game.is_episode_finished()

        return state, reward, done, info

    def render(self, mode="human"):
        """No-op; window visibility is configured in the constructor."""
        pass

    def reset(self):
        """Start a new episode and return its first preprocessed frame."""
        self.game.new_episode()
        game_state = self.game.get_state()
        # Re-sync the trackers with the new episode. Otherwise the first step
        # would compute its deltas against the previous episode's final values
        # and the sign filters in step() would hide real losses or kills.
        self.health, self.kc, self.ammo = game_state.game_variables
        return self.grayscale(game_state.screen_buffer)

    def grayscale(self, observation):
        """Convert a screen buffer into a (100, 160, 1) grayscale frame.

        Axis 0 of ``observation`` is moved to the last position before the
        colour conversion, then the image is resized to 160x100 (width x
        height) and given a trailing channel axis.
        """
        gray = cv2.cvtColor(np.moveaxis(observation, 0, -1), cv2.COLOR_BGR2GRAY)
        resize = cv2.resize(gray, (160,100), interpolation=cv2.INTER_CUBIC)
        state = np.reshape(resize, (100,160,1))
        return state

    def close(self):
        """Close the underlying game instance."""
        self.game.close()

## Create Callback

In [5]:
class TrainAndLoggingCallback(BaseCallback):
    """Callback that periodically saves the model being trained.

    Args:
        check_freq: Save a checkpoint every ``check_freq`` callback calls.
        save_path: Directory for the checkpoints; created if it does not
            exist. ``None`` disables saving.
        verbose: Verbosity level forwarded to ``BaseCallback``.
    """

    def __init__(self, check_freq, save_path, verbose=1):
        super().__init__(verbose)
        self.check_freq = check_freq
        self.save_path = save_path

    def _init_callback(self):
        """Create the checkpoint directory when a save path is configured."""
        if self.save_path is not None:
            os.makedirs(self.save_path, exist_ok=True)

    def _on_step(self):
        """Save ``model_<n_calls>`` every ``check_freq`` calls; never stops training."""
        # Same None guard as _init_callback, so a missing save path does not
        # crash inside os.path.join.
        if self.save_path is not None and self.n_calls % self.check_freq == 0:
            model_path = os.path.join(self.save_path, 'model_{}'.format(self.n_calls))
            self.model.save(model_path)

        # Returning True tells the training loop to continue.
        return True

In [6]:
# Directory the checkpoint callback saves models into.
MODEL_DIR = './train/models'
# Log directory; the commented-out PPO constructor passes it as tensorboard_log.
LOG_DIR = './logs/'

In [7]:
# Save a checkpoint into MODEL_DIR every 10000 callback calls.
callback = TrainAndLoggingCallback(check_freq=10000, save_path=MODEL_DIR)

## Training

### Set difficulty

In [8]:
# Scenario config files, one per difficulty setting (d1 .. d5).
d1, d2, d3, d4, d5 = (
    'deadly_corridor1.cfg',
    'deadly_corridor2.cfg',
    'deadly_corridor3.cfg',
    'deadly_corridor4.cfg',
    'deadly_corridor5.cfg',
)

### Train Model

#### Load Pretrained Model

In [11]:
# Load a previously saved PPO model from this checkpoint archive.
model = PPO.load('train/train_corridor/bm_d4_5.zip')

#### Create model (skip this cell if you already loaded a pretrained model)

In [None]:
# env = doomGym(config=d1)
# model = PPO('CnnPolicy', env, tensorboard_log=LOG_DIR, verbose=1, learning_rate=0.00001, n_steps=8192, clip_range=0.1, gamma=0.95, gae_lambda=0.9)

#### Train Model

In [None]:
# env = doomGym(config=d1)
# model.set_env(env)
# model.learn(total_timesteps=200000, callback=callback)
# env.close()

In [None]:
# env = doomGym(config=d2)
# model.set_env(env)
# model.learn(total_timesteps=60000, callback=callback)
# env.close()

In [None]:
# env = doomGym(config=d3)
# model.set_env(env)
# model.learn(total_timesteps=60000, callback=callback)
# env.close()

In [None]:
# env = doomGym(config=d4)
# model.set_env(env)
# model.learn(total_timesteps=1000000, callback=callback)
# env.close()

Don't run the cell below without first saving your best d4 model: the model performs better on d5 when it has not been trained on d5.

In [29]:
# env = doomGym(config=d5)
# model.set_env(env)
# model.learn(total_timesteps=500000, callback=callback)
# env.close()

Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Wrapping the env in a VecTransposeImage.
Logging to ./logs/logs_corridor\PPO_1
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 87.1     |
|    ep_rew_mean     | 4.5e+03  |
| time/              |          |
|    fps             | 15       |
|    iterations      | 1        |
|    time_elapsed    | 519      |
|    total_timesteps | 8192     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 86.9        |
|    ep_rew_mean          | 4.43e+03    |
| time/                   |             |
|    fps                  | 14          |
|    iterations           | 2           |
|    time_elapsed         | 1093        |
|    total_timesteps      | 16384       |
| train/                  |             |
|    approx_kl            | 0.007915325 |
|    clip_fraction        | 0.0882      |


<stable_baselines3.ppo.ppo.PPO at 0x197312033d0>

## Test Model

In [14]:
# Load the saved PPO checkpoint that will be evaluated below.
model = PPO.load('train/train_corridor/bm_d4_5.zip')

In [9]:
# Build the environment on the d5 config with the game window visible.
env = doomGym(render=True, config=d5)
# reset() starts a new episode and returns its first preprocessed frame,
# which is shown as this cell's output.
env.reset()

array([[[32],
        [33],
        [25],
        ...,
        [27],
        [23],
        [24]],

       [[27],
        [33],
        [23],
        ...,
        [24],
        [24],
        [24]],

       [[20],
        [35],
        [23],
        ...,
        [24],
        [24],
        [24]],

       ...,

       [[75],
        [63],
        [62],
        ...,
        [44],
        [71],
        [60]],

       [[15],
        [48],
        [47],
        ...,
        [49],
        [69],
        [47]],

       [[22],
        [14],
        [26],
        ...,
        [57],
        [37],
        [39]]], dtype=uint8)

In [10]:
# Run the loaded model in `env` for 20 evaluation episodes; the call's return
# value is displayed as the cell output.
evaluate_policy(model, env, n_eval_episodes=20)



KeyboardInterrupt: 