#### Setting up game configurations

In [1]:
from vizdoom import *
import random # for random actions
import time
import numpy as np

# Open AI Gym dependencies
import gymnasium as gym
from gymnasium import Env
from gymnasium.spaces import Discrete, Box # for random actions and nxm for random observation space (frames)
import cv2

# Stable Baselines3 dependencies
import os
from stable_baselines3.common.callbacks import BaseCallback
from stable_baselines3.common import env_checker
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.vec_env import DummyVecEnv, VecFrameStack
from stable_baselines3.common.evaluation import evaluate_policy

# Optimizer
import optuna 

Gym has been unmaintained since 2022 and does not support NumPy 2.0 amongst other critical functionality.
Please upgrade to Gymnasium, the maintained drop-in replacement of Gym, or contact the authors of your software and request that they upgrade.
Users of this version of Gym should be able to simply replace 'import gym' with 'import gymnasium as gym' in the vast majority of cases.
See the migration guide at https://gymnasium.farama.org/introduction/migration_guide/ for additional information.
  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Game configuration : Take Cover configuration file - Level 1
config = 'github_vizdoom_repo/ViZDoom/scenarios/take_cover_lev1.cfg'

In [3]:
# Creating Vizdoom OpenAI Gym Environment
class VizDoomGym(Env):
    def __init__(self, render_mode = False, config = config): # By default, rendering is disabled
        # Inheriting from the Env class
        super().__init__()
        # Setup game
        self.game = DoomGame()
        self.game.load_config(config) 

        # Rendering mode : if unabled, the game will not be displayed but the training will be faster
        if render_mode == False:
            self.game.set_window_visible(False)
        else:
            self.game.set_window_visible(True)

        # Start the game
        self.game.init()
        
        # In order to get the game frame size, run a dummy demo and get the screen buffer shape  with game.get_state().screen_buffer.shape
        self.observation_space = Box(low=0, high=255, shape=(100, 160, 1), dtype = np.uint8)
        # Action space
        self.action_space = Discrete(2) # left, right
        self.current_step = 0

        # Strategy-based reward:
        # With only the movement reward the agent can only learn once it dies when shooted by a fireball
        # In order to make it learn while it plays, we add a small reward for dodging the fireballs
        self.damage_taken = 0
        self.hits_taken = 0
        self.dead = False


    # Defining how to make a step in the env
    def step(self, action):
        actions = np.identity(2, dtype=np.uint8) # Possible actions [left, right]
        living_reward = self.game.make_action(actions[action], 4) # Defyining the frame skip parameter to 4

        # Check if the frames are over
        reward = 0
        if self.game.get_state():
            state = self.game.get_state().screen_buffer
            obs = self.grayscale(state)

            # Performe reward shaping
            game_variables = self.game.get_state().game_variables
            health, damage_taken, hits_taken, dead = game_variables

            # Reward function
            damage_taken_delta = - damage_taken + self.damage_taken
            self.damage_taken = damage_taken
            hits_taken_delta = - hits_taken + self.hits_taken
            self.hits_taken = hits_taken
            if dead:
                death_peanalty = -100
            else:
                death_peanalty = 0

            lifespan_bonus = 0.01 * self.current_step

            # Combine the basic reward of the game with the shaped reward 
            reward = (living_reward*2 + damage_taken_delta*3 + hits_taken_delta*30 + death_peanalty + lifespan_bonus) / 100.0

            info = {"health": health,
                    "damage_taken" : damage_taken,
                    "hits_taken" : hits_taken,
                    'dead' : dead
                    }
            terminated = self.game.is_episode_finished()
            truncated = self.current_step >= 2100  # Max steps
        else: # Default zeros observation
            obs = np.zeros(self.observation_space.shape, dtype=np.uint8)
            terminated = True
            truncated = False
            info = {} 

        self.current_step += 1
        return obs, reward, terminated, truncated, info # Changed parameters order according to Gymnasium API
    
    def render():
        pass
    
    # What appens when starting a new episode
    def reset(self, seed = None, options = None):
        super().reset(seed=seed)
        if seed is not None:
            random.seed(seed)
            np.random.seed(seed)

        self.game.new_episode()
        state = self.game.get_state().screen_buffer
        obs = self.grayscale(state)
        info = {"health": self.game.get_state().game_variables[0],
                "damage_taken" :0,
                "hits_taken" : 0,
                'dead' : False}
        
        self.current_step = 0

        return obs, info

    # Grayscale and resize the frames in order to reduce the observation space
    def grayscale(self, observation):
        gray = cv2.cvtColor(np.moveaxis(observation, 0, -1), cv2.COLOR_BGR2GRAY) # moveaxis moves the first element (0) to last position (-1)
        resize = cv2.resize(gray, (160, 100), interpolation=cv2.INTER_CUBIC)
        state = np.reshape(resize, (100,160, 1))
        return state

    def close(self):
        self.game.close()

In [4]:
# Check pass to see if the environment works
env_checker.check_env(env=VizDoomGym())

### Setting up Callbacks

In [5]:
class TrainAndLoggingCallback(BaseCallback):
    def __init__(self, check_freq, save_path, verbose=1):
        super(TrainAndLoggingCallback, self).__init__(verbose)
        self.check_freq = check_freq
        self.save_path = save_path

    def _init_callback(self):
        if self.save_path is not None:
            os.makedirs(self.save_path, exist_ok=True)

    def _on_step(self):
        if self.n_calls % self.check_freq == 0:
            model_path = os.path.join(self.save_path, 'best_model_{}.zip'.format(self.n_calls))
            self.model.save(model_path)
            
        return True

In [6]:
CHECKPOINT_DIR = './train/train_TakeCover_RewardShaping'
OPT_DIR = './opt/opt_TakeCover_RewardShaping'
LOG_DIR = './logs/log_TakeCover_RewardShaping'

In [7]:
callback = TrainAndLoggingCallback(check_freq=10000, save_path = CHECKPOINT_DIR)

### Hyperparameters Tuning with Optuna

In [8]:
# Objective function to test hyperparameters
def optimize_ppo(trial):
    return {
        'n_steps': trial.suggest_int('n_steps', 2048, 8192, step = 64), # number of frames used in a single training step (must be multiple of 64)
        'gamma': trial.suggest_float('gamma', 0.8, 0.9999, log = True), # discount factor
        'learning_rate': trial.suggest_float('learning_rate', 1e-5, 1e-4, log = True),
        'clip_range': trial.suggest_float('clip_range', 0.1, 0.4),
        'gae_lambda': trial.suggest_float('gae_lambda', 0.8, 0.99)
    }

In [9]:
# Run a training loop and return the mean reward
def optimize_agent(trial):
    try:
        model_params = optimize_ppo(trial) # set of hyperparameters to test

        # Create new env
        env = VizDoomGym(config=config)
        env = Monitor(env, LOG_DIR)
        env = DummyVecEnv([lambda: env])
        env = VecFrameStack(env, 4, channels_order='last')

        # Set up, train and evaluatethe PPO model
        model = PPO('CnnPolicy', env, tensorboard_log=LOG_DIR, **model_params)
        model.learn(total_timesteps=100000)
        
        mean_reward, _ = evaluate_policy(model, env, n_eval_episodes=5)
        env.close()

        # Save the best model 
        SAVE_PATH = os.path.join(OPT_DIR, 'trial_{}_best_model'.format(trial.number))
        model.save(SAVE_PATH)

        return mean_reward


    except Exception as e:
        print(e)
        return -1000 # Return a very low reward if training fails

In [10]:
# Begin an optuna study
study = optuna.create_study(direction='maximize')
study.optimize(optimize_agent, n_trials=10, n_jobs=-1, show_progress_bar = True)

[I 2026-01-03 16:27:14,074] A new study created in memory with name: no-name-7bc8a393-0f5a-44c1-91a8-c72984ac60f3
Best trial: 0. Best value: 19.1826:  10%|█         | 1/10 [1:37:11<14:34:44, 5831.62s/it]

[I 2026-01-03 18:04:25,649] Trial 0 finished with value: 19.182560000000002 and parameters: {'n_steps': 3456, 'gamma': 0.8840423020544265, 'learning_rate': 5.539541186344991e-05, 'clip_range': 0.3667349691600259, 'gae_lambda': 0.928852442517744}. Best is trial 0 with value: 19.182560000000002.


Best trial: 4. Best value: 19.2889:  20%|██        | 2/10 [1:37:35<5:22:03, 2415.49s/it] 

[I 2026-01-03 18:04:49,883] Trial 4 finished with value: 19.28886 and parameters: {'n_steps': 4032, 'gamma': 0.9225613914691654, 'learning_rate': 1.763510426648654e-05, 'clip_range': 0.39380015046530303, 'gae_lambda': 0.9097655301451003}. Best is trial 4 with value: 19.28886.


Best trial: 5. Best value: 21.4696:  30%|███       | 3/10 [1:37:54<2:34:06, 1320.91s/it]

[I 2026-01-03 18:05:08,249] Trial 5 finished with value: 21.469559999999998 and parameters: {'n_steps': 2240, 'gamma': 0.9579173855647211, 'learning_rate': 5.712170885964005e-05, 'clip_range': 0.2739558996813305, 'gae_lambda': 0.8672681161145809}. Best is trial 5 with value: 21.469559999999998.


Best trial: 5. Best value: 21.4696:  40%|████      | 4/10 [1:38:03<1:20:19, 803.27s/it] 

[I 2026-01-03 18:05:17,979] Trial 7 finished with value: 14.806320000000003 and parameters: {'n_steps': 5952, 'gamma': 0.9674897174514736, 'learning_rate': 3.361033246483096e-05, 'clip_range': 0.350663680120891, 'gae_lambda': 0.8610663489530634}. Best is trial 5 with value: 21.469559999999998.


Best trial: 6. Best value: 28.9046:  50%|█████     | 5/10 [1:38:27<43:29, 521.94s/it]  

[I 2026-01-03 18:05:41,100] Trial 6 finished with value: 28.90458 and parameters: {'n_steps': 4608, 'gamma': 0.9408444835113065, 'learning_rate': 2.60674944816054e-05, 'clip_range': 0.2680823697885537, 'gae_lambda': 0.8710076704861864}. Best is trial 6 with value: 28.90458.


Best trial: 9. Best value: 34.5727:  60%|██████    | 6/10 [1:38:57<23:38, 354.68s/it]

[I 2026-01-03 18:06:11,097] Trial 9 finished with value: 34.57268 and parameters: {'n_steps': 4096, 'gamma': 0.9815629920783245, 'learning_rate': 3.185875331863637e-05, 'clip_range': 0.10692545774744526, 'gae_lambda': 0.873701388032408}. Best is trial 9 with value: 34.57268.


Best trial: 9. Best value: 34.5727:  70%|███████   | 7/10 [1:39:15<12:14, 244.67s/it]

[I 2026-01-03 18:06:29,273] Trial 8 finished with value: 24.29858 and parameters: {'n_steps': 5440, 'gamma': 0.9001434620593336, 'learning_rate': 7.6670735749366e-05, 'clip_range': 0.32928647352337914, 'gae_lambda': 0.9452799343431608}. Best is trial 9 with value: 34.57268.


Best trial: 9. Best value: 34.5727:  80%|████████  | 8/10 [1:39:44<05:51, 175.96s/it]

[I 2026-01-03 18:06:58,110] Trial 2 finished with value: 31.43582 and parameters: {'n_steps': 8128, 'gamma': 0.9604606262970521, 'learning_rate': 8.806788223370525e-05, 'clip_range': 0.11252556706285016, 'gae_lambda': 0.9403019200858607}. Best is trial 9 with value: 34.57268.


Best trial: 9. Best value: 34.5727:  90%|█████████ | 9/10 [1:39:59<02:05, 125.67s/it]

[I 2026-01-03 18:07:13,225] Trial 1 finished with value: 7.8185199999999995 and parameters: {'n_steps': 7680, 'gamma': 0.9278481584560578, 'learning_rate': 1.4259807419247515e-05, 'clip_range': 0.2378956258940986, 'gae_lambda': 0.9291356274606357}. Best is trial 9 with value: 34.57268.


Best trial: 9. Best value: 34.5727: 100%|██████████| 10/10 [1:40:03<00:00, 600.34s/it]

[I 2026-01-03 18:07:17,442] Trial 3 finished with value: 10.164260000000002 and parameters: {'n_steps': 7680, 'gamma': 0.8028647457232007, 'learning_rate': 3.59643075134149e-05, 'clip_range': 0.20265939395490346, 'gae_lambda': 0.8559842296593158}. Best is trial 9 with value: 34.57268.





In [11]:
# Print the best hyperparameters
print("Best hyperparameters combo: ", study.best_params)

# Saving best study results into csv file
with open('./best_hyperparameters_TakeCover_RewardShaping.csv', 'w') as f:
    for key, value in study.best_params.items():
        f.write(f"{key},{value}\n")

Best hyperparameters combo:  {'n_steps': 4096, 'gamma': 0.9815629920783245, 'learning_rate': 3.185875331863637e-05, 'clip_range': 0.10692545774744526, 'gae_lambda': 0.873701388032408}


### Fine-Tuning the best Agent

In [12]:
env = VizDoomGym(config=config)

In [13]:
# instantiate PPO model
model = PPO('CnnPolicy', # policy type -> CnnPolicy since we are working on image frames
            env,
            tensorboard_log=LOG_DIR,
            verbose=1,
            learning_rate=study.best_params['learning_rate'],
            n_steps=study.best_params['n_steps'], 
            clip_range=study.best_params['clip_range'],
            gamma= study.best_params['gamma'],
            gae_lambda=study.best_params['gae_lambda']
)

Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Wrapping the env in a VecTransposeImage.


In [14]:
model.learn(total_timesteps = 400000, callback=callback)

Logging to ./logs/log_TakeCover_RewardShaping/PPO_12
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 115      |
|    ep_rew_mean     | 9.79     |
| time/              |          |
|    fps             | 171      |
|    iterations      | 1        |
|    time_elapsed    | 23       |
|    total_timesteps | 4096     |
---------------------------------
-------------------------------------------
| rollout/                |               |
|    ep_len_mean          | 108           |
|    ep_rew_mean          | 9.21          |
| time/                   |               |
|    fps                  | 148           |
|    iterations           | 2             |
|    time_elapsed         | 55            |
|    total_timesteps      | 8192          |
| train/                  |               |
|    approx_kl            | 0.00041768063 |
|    clip_fraction        | 0.00227       |
|    clip_range           | 0.107         |
|    entropy_loss         | -0.693 

<stable_baselines3.ppo.ppo.PPO at 0x7f9441baf9a0>

### Curriculum Learning

##### Level 2

In [None]:
# Load the best model in the previous environment (Level 1)
model.load("./train/train_TakeCover_RewardShaping/best_model_400000.zip") 

In [None]:
# Load level 2 configuration
env = VizDoomGym(config='github_vizdoom_repo/ViZDoom/scenarios/take_cover_lev2.cfg')
model.set_env(env)

# Retrain the model 
model.learn(total_timesteps = 400000, callback=callback)

##### Level 3

In [None]:
# Load best level 2 model
model.load("./train/train_TakeCover_RewardShaping/best_model_500000.zip") 

# Load level 3 configuration
env = VizDoomGym(config='github_vizdoom_repo/ViZDoom/scenarios/take_cover_lev3.cfg')
model.set_env(env)

# Retrain the model 
model.learn(total_timesteps = 250000, callback=callback)

##### Level 4

In [None]:
# Load best level 3 model
model.load("./train/train_TakeCover_RewardShaping/best_model_670000.zip") 

# Load level 4 configuration
env = VizDoomGym(config='github_vizdoom_repo/ViZDoom/scenarios/take_cover_lev4.cfg')
model.set_env(env)

# Retrain the model 
model.learn(total_timesteps = 250000, callback=callback)

### Testing the trained agent on real-time game

In [None]:
# Reload the best model from disk
BEST_MODEL_DIR = "./train/train_TakeCover_RewardShaping/best_model_780000.zip"
model = PPO.load(BEST_MODEL_DIR) 

In [None]:
env = VizDoomGym(render_mode= True, config="github_vizdoom_repo/ViZDoom/scenarios/take_cover_lev1.cfg") # rendered-env

In [None]:
mean_reward, _ = evaluate_policy(model, env, n_eval_episodes=50)
print(f"Mean reward over 50 episodes: {mean_reward}")

In [None]:
env.close()