# setup Vizdoom

In [1]:
!pip install vizdoom

Collecting vizdoom
  Downloading vizdoom-1.1.13-cp38-cp38-win_amd64.whl (15.4 MB)
Installing collected packages: vizdoom
Successfully installed vizdoom-1.1.13


In [1]:
# Import vizdoom for game env
from vizdoom import * 
# Import random for action sampling
import random
# Import time for sleeping
import time 
# Import numpy for identity matrix(action have only 3 value)
import numpy as np

In [2]:
# Set up the game: create the engine object, apply the scenario config
# (the path is relative to the current working directory), then start it.
# The config is loaded before init() is called.
game = DoomGame()
game.load_config('github/VizDoom/scenarios/basic.cfg')
game.init()

In [3]:
# The set of actions we can take in the environment: the rows of a 3x3
# identity matrix, i.e. three one-hot uint8 vectors. One row is picked
# at random per step in the loop below.
actions = np.identity(3, dtype=np.uint8)

In [4]:
# Play several episodes using a purely random policy
episodes = 10
for episode in range(episodes):
    # Start a fresh episode
    game.new_episode()
    # Keep stepping until the game reports that the episode is over
    while True:
        if game.is_episode_finished():
            break
        # Snapshot of the current game state
        state = game.get_state()
        # Raw frame rendered by the game
        img = state.screen_buffer
        # Game variables (the original note says this is the ammo count)
        info = state.game_variables
        # Pick one of the one-hot action rows at random ...
        chosen_action = random.choice(actions)
        # ... and apply it, passing 4 as the second argument (frame skip)
        reward = game.make_action(chosen_action, 4)
        # Show the reward obtained for this step
        print('reward:', reward)
        time.sleep(0.02)
    # Episode finished: report the accumulated reward, then pause briefly
    print('Result:', game.get_total_reward())
    time.sleep(2)

reward: -4.0
reward: 99.0
Result: 95.0
reward: -4.0
reward: -9.0
reward: -4.0
reward: -4.0
reward: -4.0
reward: -9.0
reward: -4.0
reward: -4.0
reward: -4.0
reward: -4.0
reward: -9.0
reward: -4.0
reward: -4.0
reward: -4.0
reward: -9.0
reward: -4.0
reward: -4.0
reward: -4.0
reward: -4.0
reward: -9.0
reward: -4.0
reward: -4.0
reward: -9.0
reward: -4.0
reward: -4.0
reward: -4.0
reward: -4.0
reward: -4.0
reward: -4.0
reward: -9.0
reward: -4.0
reward: -4.0
reward: -9.0
reward: -4.0
reward: -4.0
reward: -4.0
reward: -9.0
reward: -4.0
reward: -4.0
reward: -9.0
reward: -4.0
reward: -4.0
reward: -4.0
reward: -9.0
reward: -4.0
reward: -4.0
reward: -4.0
reward: -4.0
reward: -4.0
reward: -4.0
reward: -9.0
reward: -4.0
reward: -4.0
reward: -4.0
reward: -4.0
reward: -4.0
reward: -4.0
reward: -4.0
reward: -9.0
reward: -4.0
reward: -4.0
reward: -9.0
reward: -4.0
reward: -4.0
reward: -4.0
reward: -9.0
reward: -4.0
reward: -4.0
reward: -9.0
reward: -4.0
reward: -4.0
reward: -4.0
reward: -9.0
reward: -4.0

In [5]:
# Close the game instance that was opened for the random-play demo.
game.close()

# Setup the gym

In [6]:
!pip install gym



In [8]:
!pip install opencv-python

  and should_run_async(code)


Collecting opencv-python
  Downloading opencv_python-4.6.0.66-cp36-abi3-win_amd64.whl (35.6 MB)
Installing collected packages: opencv-python
Successfully installed opencv-python-4.6.0.66


In [2]:
#import env base
from gym import Env
#import gym space
from gym.spaces import Discrete,Box
#import opencv
import cv2

In [3]:
# Create Vizdoom OpenAI Gym Environment
class VizDoomGym(Env):
    """Gym environment wrapping a ViZDoom game.

    Observations are grayscale frames of shape (100, 160, 1) and dtype uint8;
    the action space is Discrete(3), each action index being mapped to a
    one-hot button vector before it is sent to the game.

    Args:
        render: when truthy the game window is shown, otherwise it is hidden.
        config: path of the ViZDoom scenario config to load. Defaults to the
            basic scenario used throughout this notebook.
    """

    def __init__(self, render=False, config='github/VizDoom/scenarios/basic.cfg'):
        super().__init__()
        # Set up the game from the scenario config
        self.game = DoomGame()
        self.game.load_config(config)

        # Window visibility simply follows the `render` flag
        self.game.set_window_visible(bool(render))

        # Start the game
        self.game.init()

        # Observation and action spaces
        self.observation_space = Box(low=0, high=255, shape=(100, 160, 1), dtype=np.uint8)
        self.action_space = Discrete(3)
        # One-hot rows, one per discrete action; built once instead of on every step
        self._actions = np.identity(self.action_space.n, dtype=np.uint8)

    def step(self, action):
        """Apply `action` and return `(state, reward, done, info)`.

        `info` is a dict whose "info" key holds the first game variable
        (named `ammo` in the original code), or 0 when the game no longer
        provides a state.
        """
        # Send the one-hot action to the game; 4 is the frame-skip argument
        reward = self.game.make_action(self._actions[action], 4)

        # Query the game state once and reuse it
        game_state = self.game.get_state()
        if game_state is not None:
            state = self.grayscale(game_state.screen_buffer)
            ammo = game_state.game_variables[0]
        else:
            # No state available: return an all-zero frame with the same
            # shape and dtype as declared in observation_space
            state = np.zeros(self.observation_space.shape, dtype=self.observation_space.dtype)
            ammo = 0

        info = {"info": ammo}
        done = self.game.is_episode_finished()

        return state, reward, done, info

    def render(self, mode='human'):
        """No-op; window visibility is chosen in the constructor via `render`."""
        pass

    def reset(self):
        """Start a new episode and return its first preprocessed frame."""
        self.game.new_episode()
        state = self.game.get_state().screen_buffer
        return self.grayscale(state)

    def grayscale(self, observation):
        """Convert a frame to grayscale and resize it to shape (100, 160, 1)."""
        # Move the first axis to the end before handing the frame to OpenCV
        gray = cv2.cvtColor(np.moveaxis(observation, 0, -1), cv2.COLOR_BGR2GRAY)
        # cv2.resize takes (width, height), so this yields a 100x160 array
        resize = cv2.resize(gray, (160, 100), interpolation=cv2.INTER_CUBIC)
        state = np.reshape(resize, (100, 160, 1))
        return state

    def close(self):
        """Close the underlying game."""
        self.game.close()

In [7]:
# Instantiate the environment with its default arguments.
env=VizDoomGym()

In [9]:
# Close the environment (VizDoomGym.close() closes the underlying game).
env.close()

In [4]:
#check for a valid env
from stable_baselines3.common import env_checker

In [8]:
# Run Stable-Baselines3's environment checker on `env`.
# NOTE(review): in document order `env` was already closed by the preceding
# `env.close()` cell; the execution counts (In [8] here, In [9] on the close)
# show this check was actually run first. Re-create the env before this line
# when running the notebook top to bottom.
env_checker.check_env(env)

# Setup callback

In [3]:
!pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu113

Looking in indexes: https://pypi.org/simple, https://download.pytorch.org/whl/cu113
Collecting torchvision
  Downloading https://download.pytorch.org/whl/cu113/torchvision-0.13.1%2Bcu113-cp38-cp38-win_amd64.whl (4.7 MB)
Collecting torchaudio
  Downloading https://download.pytorch.org/whl/cu113/torchaudio-0.12.1%2Bcu113-cp38-cp38-win_amd64.whl (1.2 MB)
Installing collected packages: torchvision, torchaudio
Successfully installed torchaudio-0.12.1+cu113 torchvision-0.13.1+cu113


In [2]:
#so we can use ppo 
!pip install stable-baselines3[extra]



In [4]:
# Import os for file nav
import os 
# Import callback class from sb3
from stable_baselines3.common.callbacks import BaseCallback

In [5]:
class TrainAndLoggingCallback(BaseCallback):
    """Callback that saves the model every `check_freq` callback calls.

    Args:
        check_freq: positive number of calls between two saves.
        save_path: directory that receives the checkpoints; it is created
            if missing. When None, nothing is created or saved.
        verbose: verbosity level forwarded to BaseCallback.

    Raises:
        ValueError: if `check_freq` is not positive.
    """

    def __init__(self, check_freq, save_path, verbose=1):
        super().__init__(verbose)
        # Fail early instead of hitting a modulo-by-zero on the first step
        if check_freq <= 0:
            raise ValueError('check_freq must be a positive number, got {!r}'.format(check_freq))
        self.check_freq = check_freq
        self.save_path = save_path

    def _init_callback(self):
        # Make sure the checkpoint directory exists before training starts
        if self.save_path is not None:
            os.makedirs(self.save_path, exist_ok=True)

    def _on_step(self):
        # Skip saving when no directory was given, mirroring _init_callback
        if self.save_path is not None and self.n_calls % self.check_freq == 0:
            model_path = os.path.join(self.save_path, 'best_model_{}'.format(self.n_calls))
            self.model.save(model_path)

        # Always return True, as the original callback does
        return True

In [6]:
# Directory for model checkpoints (passed to the callback as save_path)
CHECKPOINT_DIR = './train/train_basic'
# Directory for training logs (passed to PPO as tensorboard_log)
LOG_DIR = './logs/log_basic'

In [8]:
# Save a checkpoint into CHECKPOINT_DIR every 10000 callback calls.
callback= TrainAndLoggingCallback(check_freq=10000,save_path=CHECKPOINT_DIR)

# Train with PPO

In [7]:
#import ppo
from stable_baselines3 import PPO

In [19]:
!pip install tensorflow




In [29]:
# Non-rendering environment (default constructor arguments) used for training
env= VizDoomGym()

In [40]:
# 'CnnPolicy' because the observations are images.
# verbose=1 makes training print some information.
model = PPO(
    'CnnPolicy',
    env,
    tensorboard_log=LOG_DIR,
    verbose=1,
    learning_rate=0.0001,
    n_steps=2048,
)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Wrapping the env in a VecTransposeImage.


In [41]:
# Train for up to 100000 timesteps; `callback` saves periodic checkpoints.
# (The recorded run below ends with a KeyboardInterrupt.)
model.learn(total_timesteps=100000,callback=callback)

Logging to ./logs/log_basic\PPO_9
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 30.3     |
|    ep_rew_mean     | -71.2    |
| time/              |          |
|    fps             | 25       |
|    iterations      | 1        |
|    time_elapsed    | 80       |
|    total_timesteps | 2048     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 26.2        |
|    ep_rew_mean          | -45.6       |
| time/                   |             |
|    fps                  | 16          |
|    iterations           | 2           |
|    time_elapsed         | 244         |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.010885042 |
|    clip_fraction        | 0.0888      |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.09       |
|    explained_variance   | 0.000267  

---------------------------------------
| rollout/                |           |
|    ep_len_mean          | 19.7      |
|    ep_rew_mean          | -5.95     |
| time/                   |           |
|    fps                  | 12        |
|    iterations           | 11        |
|    time_elapsed         | 1844      |
|    total_timesteps      | 22528     |
| train/                  |           |
|    approx_kl            | 0.0298495 |
|    clip_fraction        | 0.383     |
|    clip_range           | 0.2       |
|    entropy_loss         | -0.866    |
|    explained_variance   | 0.572     |
|    learning_rate        | 0.0001    |
|    loss                 | 1.27e+03  |
|    n_updates            | 100       |
|    policy_gradient_loss | 0.0347    |
|    value_loss           | 2.53e+03  |
---------------------------------------
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 14.8       |
|    ep_rew_mean          | 25.9     

KeyboardInterrupt: 

#### PPO3: n_steps = 2046, learning rate = 0.0001, best model: best_model_60000
#### PPO8: n_steps = 2048 (my mistake), learning rate = 0.001, best model: 12k
#### PPO9: n_steps = 2048 (same as PPO8), learning rate = 0.0001, best model: 16k

approx_kl: PPO compares the old agent with the new one; this value measures how different the two are. A spike in this value means there is a huge divergence between them. The next values in the log (clip_fraction / clip_range) relate to how PPO deals with this problem (clipping); if there are too many spikes we can also tune hyperparameters such as clip_range and gae_lambda.
explained_variance: we want this to be positive, because it tells us how well the critic is predicting.
policy_gradient_loss: if it goes to 0, the model isn't learning and keeps doing the same thing.

To look at the graphs, run the shell command `tensorboard --logdir=.` from the folder where the training data is saved.

What we found in the data:
###### ep_len_mean = how long we survive (episode length)
##### ep_rew_mean = reward
##### explained_variance = we want to improve this value
##### value_loss = we want this near 0
##### policy_gradient_loss = if it drops to 0 too fast, the model isn't learning
##### approx_kl = same as policy_gradient_loss: if it is near 0, the agent keeps taking the same action

# Test the model

In [9]:
#import eval policy
from stable_baselines3.common.evaluation import evaluate_policy

In [10]:
# Load a saved checkpoint for evaluation.
# NOTE(review): the training cell above requests 100000 timesteps with a
# checkpoint every 10000 calls and was interrupted, so 'best_model_160000'
# does not look like a product of that run — confirm this file exists.
modeltest= PPO.load('./train/train_basic/best_model_160000')

In [11]:
# Re-create the environment with render=True so the game window is visible.
env=VizDoomGym(render=True)

In [12]:
# Mean reward over 10 games; evaluate_policy returns (mean reward, std of reward).
# The std is bound to a real name instead of `_`, and the result is printed so
# the cell actually shows what it computed.
mean_rew, std_rew = evaluate_policy(modeltest, env, n_eval_episodes=10)
print('Mean reward over 10 episodes: {:.2f} +/- {:.2f}'.format(mean_rew, std_rew))



In [13]:
# Let the loaded model play five episodes, slowed down enough to watch
for episode in range(5):
    total_reward = 0
    obs = env.reset()
    while True:
        # Ask the model for an action given the current observation
        action, _ = modeltest.predict(obs)
        obs, reward, done, info = env.step(action)
        # Slow the playback down
        time.sleep(0.20)
        total_reward = total_reward + reward
        if done:
            break
    print('Total Reward for episode {} is {}'.format(episode, total_reward))
    # Short pause between episodes
    time.sleep(2)

Total Reward for episode 0 is 13.0
Total Reward for episode 1 is 95.0
Total Reward for episode 2 is 60.0
Total Reward for episode 3 is 95.0
Total Reward for episode 4 is -3.0


In [14]:
# Close the rendering environment (closes the underlying game).
env.close()