# setup Vizdoom

In [1]:
!pip install vizdoom

Collecting vizdoom
  Downloading vizdoom-1.1.13-cp38-cp38-win_amd64.whl (15.4 MB)
Installing collected packages: vizdoom
Successfully installed vizdoom-1.1.13


In [1]:
# Import vizdoom for game env
from vizdoom import * 
# Import random for action sampling
import random
# Import time for sleeping
import time 
# Import numpy for identity matrix(action have only 3 value)
import numpy as np

# Setup the gym

In [6]:
!pip install gym



In [8]:
!pip install opencv-python

  and should_run_async(code)


Collecting opencv-python
  Downloading opencv_python-4.6.0.66-cp36-abi3-win_amd64.whl (35.6 MB)
Installing collected packages: opencv-python
Successfully installed opencv-python-4.6.0.66


In [2]:
#import env base
from gym import Env
#import gym space
from gym.spaces import Discrete,Box
#import opencv
import cv2

In [3]:
# Create Vizdoom OpenAI Gym Environment
class VizDoomGym(Env): 
    # Function that is called when we start the env
    def __init__(self, render=False,config='github/VizDoom/scenarios/deadly_corridor_s1.cfg'): 
        # Inherit from Env
        super().__init__()
        # Setup the game 
        self.game = DoomGame()
        self.game.load_config(config)
        
        # Render frame logic
        if render == False: 
            self.game.set_window_visible(False)
        else:
            self.game.set_window_visible(True)
        
        # Start the game 
        self.game.init()
        
        # Create the action space and observation space
        self.observation_space = Box(low=0, high=255, shape=(100,160,1), dtype=np.uint8) 
        self.action_space = Discrete(7)
        
        # Game variables: HEALTH DAMAGE_TAKEN HITCOUNT SELECTED_WEAPON_AMMO 
        #for reward shaping
        self.damage_taken=0
        self.hitcount=0
        self.ammo=52
        
    # This is how we take a step in the environment
    def step(self, action):
        # Specify action and take step 
        actions = np.identity(7)
        movement_reward = self.game.make_action(actions[action], 4) 
        reward=0
        
        # Get all the other stuff we need to retun 
        if self.game.get_state(): 
            state = self.game.get_state().screen_buffer
            state = self.grayscale(state)
            #reward shaping 
            game_variables=self.game.get_state().game_variables
            health,damage_taken,hitcount,ammo=game_variables
            #delta var
            damage_taken_delta= -damage_taken+self.damage_taken
            self.damage_taken=damage_taken
            hitcount_delta=hitcount-self.hitcount
            self.hitcount=hitcount
            ammo_delta=ammo-self.ammo
            self.ammo=ammo 
            
            reward=movement_reward+damage_taken_delta*20 + hitcount_delta*350 + ammo_delta*5            
            info = ammo
        else: 
            state = np.zeros(self.observation_space.shape)
            info = 0 
        
        info = {"info":info}
        done = self.game.is_episode_finished()
        
        return state, reward, done, info 
    
    # Define how to render the game or environment 
    def render(): 
        pass
    
    # What happens when we start a new game 
    def reset(self): 
        self.game.new_episode()
        state = self.game.get_state().screen_buffer
        return self.grayscale(state)
    
    # Grayscale the game frame and resize it 
    def grayscale(self, observation):
        gray = cv2.cvtColor(np.moveaxis(observation, 0, -1), cv2.COLOR_BGR2GRAY)
        resize = cv2.resize(gray, (160,100), interpolation=cv2.INTER_CUBIC)
        state = np.reshape(resize, (100,160,1))
        return state
    
    # Call to close down the game
    def close(self): 
        self.game.close()

In [39]:
env=VizDoomGym(config='github/VizDoom/scenarios/deadly_corridor.cfg')

In [41]:
env.close()

In [17]:
#check for a valid env
from stable_baselines3.common import env_checker

In [40]:
env_checker.check_env(env)

# Setup callback

In [3]:
!pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu113

Looking in indexes: https://pypi.org/simple, https://download.pytorch.org/whl/cu113
Collecting torchvision
  Downloading https://download.pytorch.org/whl/cu113/torchvision-0.13.1%2Bcu113-cp38-cp38-win_amd64.whl (4.7 MB)
Collecting torchaudio
  Downloading https://download.pytorch.org/whl/cu113/torchaudio-0.12.1%2Bcu113-cp38-cp38-win_amd64.whl (1.2 MB)
Installing collected packages: torchvision, torchaudio
Successfully installed torchaudio-0.12.1+cu113 torchvision-0.13.1+cu113


In [2]:
#so we can use ppo 
!pip install stable-baselines3[extra]



In [4]:
# Import os for file nav
import os 
# Import callback class from sb3
from stable_baselines3.common.callbacks import BaseCallback

In [5]:
class TrainAndLoggingCallback(BaseCallback):

    def __init__(self, check_freq, save_path, verbose=1):
        super(TrainAndLoggingCallback, self).__init__(verbose)
        self.check_freq = check_freq
        self.save_path = save_path

    def _init_callback(self):
        if self.save_path is not None:
            os.makedirs(self.save_path, exist_ok=True)

    def _on_step(self):
        if self.n_calls % self.check_freq == 0:
            model_path = os.path.join(self.save_path, 'best_model_{}'.format(self.n_calls))
            self.model.save(model_path)

        return True

In [6]:
CHECKPOINT_DIR = './train/train_deadly'
LOG_DIR = './logs/log_deadly'
CHECKPOINT_DIR_ONLINE = './train/train_deadly_online'
LOG_DIR_ONLINE = './logs/log_deadly_online'
CHECKPOINT_DIR2 = './train/train_deadly2'
LOG_DIR2 = './logs/log_deadly2'
CHECKPOINT_DIR3 = './train/train_deadly3'
LOG_DIR3 = './logs/log_deadly3'
CHECKPOINT_DIR4 = './train/train_deadly4'
LOG_DIR4 = './logs/log_deadly4'
CHECKPOINT_DIR5 = './train/train_deadly5'
LOG_DIR5 = './logs/log_deadly5'

In [8]:
callback= TrainAndLoggingCallback(check_freq=10000,save_path=CHECKPOINT_DIR)

# Train with PPO

In [7]:
#import ppo
from stable_baselines3 import PPO

In [19]:
!pip install tensorflow




In [37]:
#non render env
env= VizDoomGym(config='github/VizDoom/scenarios/deadly_corridor_s1.cfg')

In [38]:
callback= TrainAndLoggingCallback(check_freq=10000,save_path=CHECKPOINT_DIR_ONLINE)

In [37]:
#cnnpolicy for img
#verbose give some info
#n_steps tootal of time frame passed for 1 traing round, so higher = more info passed
#model= PPO('CnnPolicy',env,tensorboard_log=LOG_DIR_ONLINE,verbose=1,learning_rate=0.00001,n_steps=8192)
model= PPO('CnnPolicy',env,tensorboard_log=LOG_DIR_ONLINE,verbose=1,learning_rate=0.00001,n_steps=8192, clip_range=.1, gamma=.95, gae_lambda=.9)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Wrapping the env in a VecTransposeImage.


In [39]:
model= PPO.load('./train/train_deadly_online/best_model_590kpp14')
model.set_env(env)

Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Wrapping the env in a VecTransposeImage.


In [40]:
model.learn(total_timesteps=600000,callback=callback)

Logging to ./logs/log_deadly_online\PPO_17
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 134      |
|    ep_rew_mean     | 896      |
| time/              |          |
|    fps             | 30       |
|    iterations      | 1        |
|    time_elapsed    | 268      |
|    total_timesteps | 8192     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 128         |
|    ep_rew_mean          | 918         |
| time/                   |             |
|    fps                  | 19          |
|    iterations           | 2           |
|    time_elapsed         | 840         |
|    total_timesteps      | 16384       |
| train/                  |             |
|    approx_kl            | 0.007732519 |
|    clip_fraction        | 0.259       |
|    clip_range           | 0.1         |
|    entropy_loss         | -1.11       |
|    explained_variance   | 0

------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 128          |
|    ep_rew_mean          | 977          |
| time/                   |              |
|    fps                  | 12           |
|    iterations           | 11           |
|    time_elapsed         | 6962         |
|    total_timesteps      | 90112        |
| train/                  |              |
|    approx_kl            | 0.0059568407 |
|    clip_fraction        | 0.236        |
|    clip_range           | 0.1          |
|    entropy_loss         | -1.02        |
|    explained_variance   | 0.466        |
|    learning_rate        | 1e-05        |
|    loss                 | 1.57e+04     |
|    n_updates            | 820          |
|    policy_gradient_loss | 0.003        |
|    value_loss           | 3.09e+04     |
------------------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_m

KeyboardInterrupt: 

# Difficulty 1 Test effetuati da me
ricorda che se ep_lengh troppo bassa l'agente potrebbe star correndo dritto che buono per dif 1 ma per il resto è overfitted
### Test con 40000 timesteps
####  ppo8 n_steps 4096 l_rate .0001 best model 110k
len sembra bassa dai dati.Infatti dai test è palese che l'agente vada dritto ignorando i nemici
#### ppo9 n_step 8192 l_rate .0001 best model 150k
prova gia a sparare ai nemici ma non va molto avanti
#### ppo10 n-step 8192 l_rate .00001 best model 190k
cerca di uccidere i due nemici ma poi non va avanti, provare a riprendere ppo10 e portartalo a piu timesteps
### Partendo da PP010 best_model con 80000 timesteps
#### ppo11 stesse stat best model 50k o 80k
visto che intorno a 55k timesteps il tempo diventa troppo basso bisogna testare 2 modelli: il migliore per reward e quello che i dati danno con buona reward e un buon tempo.Dai test in qualche modo sono peggiori di ppo10


# Test con iperparametri trovati online
#### ppo3 addestrato con 400k timesteps nst model 430k va troppo giu il tempo medio quindi ri addestrare
#### pp4 a partire da 10k best model di ppo3 miglriore ma dopo circa 200k va anche lui troppo basso
#### pp7 a partire da 180k di pp4 va sempre troppo giu il tempo bisogna riprovare da capo
#### pp10 aumentata reward hitcount 300
#### pp12 aumentata anche dmg taken a 50 secondo me migliore riprovare con ancora di 
#### ppo13 dmg taken 100, troppo alto l'agente si mette in fondo mappa e gira su se stesso (come non chiedere)
#### ppo14 20,350,5  (350k)avanza e uccide i nemi a sinistra finalmente un buon set di perparametri,(450k) entra seconda zona facile ma poi si ferma a volte torna anche indietro,(490) uccide subito un nemico va avanti e uccide un nemico della seconda zona,(550k) ci stiamo avviacinando molto capita che uccide quasi tutti e arriva nella zona finale spesso,(590k) molto aggressivo nel andare avanti e una vittoria durante il testing 
#### ppo15 riprendo da 590k pp14,previsto solo altri 100k ~ passi di addestramento(da rifare mi sa che uso funzione sbagliata)
#### ppo16 come 15 ma con ppo.load.partono con +590k, non sembra che parta da pp14 ma neanche da zero non capisco .... errore stupido ho messo modeltest=ppo.load al posto di model......(cio rende ancora piu strano che sembri non partire da zero)
#### ppo17 stavolta model=ppo.load adesso ha continuato il training ,su molti modelli vince almeno 1 volta su 5 


a

# curriculum

In [9]:
env = VizDoomGym(config='github/VizDoom/scenarios/deadly_corridor_s2.cfg')
model= PPO.load('./train/train_deadly_online/best_model_700k')
callback= TrainAndLoggingCallback(check_freq=10000,save_path=CHECKPOINT_DIR2)
model.set_env(env)
model.learn(total_timesteps=40000, callback=callback)

Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Wrapping the env in a VecTransposeImage.
Logging to ./logs/log_deadly_online\PPO_18
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 81.1     |
|    ep_rew_mean     | 805      |
| time/              |          |
|    fps             | 27       |
|    iterations      | 1        |
|    time_elapsed    | 297      |
|    total_timesteps | 8192     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 78.7        |
|    ep_rew_mean          | 850         |
| time/                   |             |
|    fps                  | 19          |
|    iterations           | 2           |
|    time_elapsed         | 819         |
|    total_timesteps      | 16384       |
| train/                  |             |
|    approx_kl            | 0.024473336 |
|    clip_fraction        | 0.388    

<stable_baselines3.ppo.ppo.PPO at 0x123bd81fe20>

In [15]:
env = VizDoomGym(config='github/VizDoom/scenarios/deadly_corridor_s3.cfg')
model= PPO.load('./train/train_deadly2/best_model_40000')
callback= TrainAndLoggingCallback(check_freq=10000,save_path=CHECKPOINT_DIR3)
model.set_env(env)

model.learn(total_timesteps=40000, callback=callback)

Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Wrapping the env in a VecTransposeImage.
Logging to ./logs/log_deadly_online\PPO_19
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 104      |
|    ep_rew_mean     | 965      |
| time/              |          |
|    fps             | 39       |
|    iterations      | 1        |
|    time_elapsed    | 208      |
|    total_timesteps | 8192     |
---------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 106          |
|    ep_rew_mean          | 929          |
| time/                   |              |
|    fps                  | 24           |
|    iterations           | 2            |
|    time_elapsed         | 671          |
|    total_timesteps      | 16384        |
| train/                  |              |
|    approx_kl            | 0.0071155545 |
|    clip_fraction        

<stable_baselines3.ppo.ppo.PPO at 0x123b75e79d0>

In [14]:
env = VizDoomGym(config='github/VizDoom/scenarios/deadly_corridor_s4.cfg')
model=PPO.load('./train/train_deadly3/best_model_40000')
callback= TrainAndLoggingCallback(check_freq=10000,save_path=CHECKPOINT_DIR4)
model.set_env(env)

model.learn(total_timesteps=40000, callback=callback)

Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Wrapping the env in a VecTransposeImage.
Logging to ./logs/log_deadly_online\PPO_20
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 104      |
|    ep_rew_mean     | 879      |
| time/              |          |
|    fps             | 26       |
|    iterations      | 1        |
|    time_elapsed    | 311      |
|    total_timesteps | 8192     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 96.1        |
|    ep_rew_mean          | 931         |
| time/                   |             |
|    fps                  | 18          |
|    iterations           | 2           |
|    time_elapsed         | 864         |
|    total_timesteps      | 16384       |
| train/                  |             |
|    approx_kl            | 0.005402882 |
|    clip_fraction        | 0.261    

<stable_baselines3.ppo.ppo.PPO at 0x186ffbc4e50>

In [17]:
env = VizDoomGym(config='github/VizDoom/scenarios/deadly_corridor.cfg')
model=PPO.load('./train/train_deadly4/best_model_40000')
callback= TrainAndLoggingCallback(check_freq=10000,save_path=CHECKPOINT_DIR5)
model.set_env(env)

model.learn(total_timesteps=40000, callback=callback)

Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Wrapping the env in a VecTransposeImage.
Logging to ./logs/log_deadly_online\PPO_21
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 35.8     |
|    ep_rew_mean     | 375      |
| time/              |          |
|    fps             | 31       |
|    iterations      | 1        |
|    time_elapsed    | 262      |
|    total_timesteps | 8192     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 32.9        |
|    ep_rew_mean          | 342         |
| time/                   |             |
|    fps                  | 20          |
|    iterations           | 2           |
|    time_elapsed         | 789         |
|    total_timesteps      | 16384       |
| train/                  |             |
|    approx_kl            | 0.044699937 |
|    clip_fraction        | 0.529    

<stable_baselines3.ppo.ppo.PPO at 0x18686f0fd00>

# Test the model

In [8]:
#import eval policy
from stable_baselines3.common.evaluation import evaluate_policy

In [57]:
modeltest= PPO.load('./train/train_deadly_online/best_model_110000')

In [18]:
modeltest= PPO.load('./train/train_deadly5/best_model_40000')

In [22]:
env=VizDoomGym(render=True,config='github/VizDoom/scenarios/deadly_corridor_s2.cfg')

In [12]:
#mean reward for 10 games 
mean_rew, _ = evaluate_policy(modeltest,env,n_eval_episodes=10)



In [23]:
for episode in range(5): 
    obs = env.reset()
    done = False
    total_reward = 0
    while not done: 
        action, _ = modeltest.predict(obs)
        obs, reward, done, info = env.step(action)
        time.sleep(0.20)
        total_reward += reward
    print('Total Reward for episode {} is {}'.format(episode, total_reward))
    time.sleep(2)

Total Reward for episode 0 is -93.43565368652344
Total Reward for episode 1 is 162.76583862304688
Total Reward for episode 2 is 2050.3453979492188
Total Reward for episode 3 is 53.37300109863281
Total Reward for episode 4 is 1141.3853454589844


In [24]:
env.close()