In [62]:
! pip install gym gym-retro 



In [63]:
!pip install torch==1.10.2+cu113 torchvision==0.11.3+cu113 torchaudio===0.10.2+cu113 -f https://download.pytorch.org/whl/cu113/torch_stable.html


Looking in links: https://download.pytorch.org/whl/cu113/torch_stable.html


### Imports and basic testing of installation

In [64]:
import retro # The main library
import time # For timing learning, if needed
import pygame # For rendering the game

In [65]:
retro.data.list_games()
# There are actually many games provided by Gym Retro: this command let's you check them out!

['1942-Nes',
 '1943-Nes',
 '3NinjasKickBack-Genesis',
 '8Eyes-Nes',
 'AaahhRealMonsters-Genesis',
 'AbadoxTheDeadlyInnerWar-Nes',
 'AcceleBrid-Snes',
 'ActRaiser2-Snes',
 'ActionPachio-Snes',
 'AddamsFamily-GameBoy',
 'AddamsFamily-Genesis',
 'AddamsFamily-Nes',
 'AddamsFamily-Sms',
 'AddamsFamily-Snes',
 'AddamsFamilyPugsleysScavengerHunt-Nes',
 'AddamsFamilyPugsleysScavengerHunt-Snes',
 'AdvancedBusterhawkGleylancer-Genesis',
 'Adventure-Atari2600',
 'AdventureIsland-GameBoy',
 'AdventureIsland3-Nes',
 'AdventureIslandII-Nes',
 'AdventuresOfBatmanAndRobin-Genesis',
 'AdventuresOfBayouBilly-Nes',
 'AdventuresOfDinoRiki-Nes',
 'AdventuresOfDrFranken-Snes',
 'AdventuresOfKidKleets-Snes',
 'AdventuresOfMightyMax-Genesis',
 'AdventuresOfMightyMax-Snes',
 'AdventuresOfRockyAndBullwinkleAndFriends-Genesis',
 'AdventuresOfRockyAndBullwinkleAndFriends-Nes',
 'AdventuresOfRockyAndBullwinkleAndFriends-Snes',
 'AdventuresOfStarSaver-GameBoy',
 'AdventuresOfYogiBear-Snes',
 'AeroFighters-Snes',
 

In [66]:
#for making grayscale, decreasing pixels and making process faster
from gym import Env
from gym.spaces import Discrete, Box, MultiBinary
import numpy as np
import cv2

In [67]:
#creating a custom environment
class StreetFighter(Env):
    def __init__(self):
        super().__init__()
        self.observation_space = Box(low=0, high=255, shape=(84, 84, 1), dtype=np.uint8)
        self.action_space = MultiBinary(12)
        self.game = retro.make(game='StreetFighterIISpecialChampionEdition-Genesis', use_restricted_actions=retro.Actions.FILTERED)
        #self.score = 0
    
    def step(self, action):
        obs, reward, done, info = self.game.step(action)
        obs = self.preprocess(obs)
        
        # Preprocess frame from game
        frame_delta = obs - self.previous_frame
        self.previous_frame = obs
        
        # Shape reward
        reward = info['score'] - self.score 
        self.score = info['score']

        return frame_delta, reward, done, info 
    
    def render(self, *args, **kwargs): 
        self.game.render(*args, **kwargs)
    
    def reset(self):
        self.previous_frame = np.zeros(self.game.observation_space.shape)
        
        # Frame delta
        obs = self.game.reset()
        obs = self.preprocess(obs)
        self.previous_frame = obs
        self.health = 176
        self.enemy_health = 176
        
        # Create initial variables
        self.score = 0

        return obs
    
    def preprocess(self, observation): 
        gray = cv2.cvtColor(observation, cv2.COLOR_BGR2GRAY)
        resize = cv2.resize(gray, (84,84), interpolation=cv2.INTER_CUBIC)
        state = np.reshape(resize, (84,84,1))
        return state
    
    def close(self): 
        self.game.close()


In [70]:
env = StreetFighter()

In [71]:
env.observation_space.shape

(84, 84, 1)

In [72]:
model = PPO.load(os.path.join(OPT_DIR, "trial_0_best_model.zip"))

In [73]:
# Import optuna for HPO
import optuna
# Import PPO for algos
from stable_baselines3 import PPO
# Evaluate Policy
from stable_baselines3.common.evaluation import evaluate_policy
# Import wrappers
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.vec_env import DummyVecEnv, VecFrameStack
import os

In [83]:
LOG_DIR = './logs/'
OPT_DIR = './opt_nodelta/'

In [89]:
# #https://github.com/araffin/rl-baselines-zoo/issues/29
def optimize_ppo(trial):
    """ Learning hyperparamters we want to optimise"""
    return {
        'n_steps': trial.suggest_int('n_steps', 2048, 8192),
        'gamma': trial.suggest_loguniform('gamma', 0.8, 0.9999),
        'learning_rate': trial.suggest_loguniform('learning_rate', 1e-5, 1e-4),
        'clip_range': trial.suggest_uniform('clip_range', 0.1, 0.4),
        'gae_lambda': trial.suggest_uniform('gae_lambda', 0.8, .99)
    }

In [94]:
def optimize_agent(trial):
    #try:
        model_params = optimize_ppo(trial)
        env = StreetFighter()
        env = Monitor(env, LOG_DIR)
        env = DummyVecEnv([lambda: env])
        env = VecFrameStack(env, 4, channels_order='last')
        model = PPO('CnnPolicy', env, tensorboard_log=LOG_DIR, verbose=0, **model_params)
        model.learn(total_timesteps=100000)
        mean_reward, _ = evaluate_policy(model, env, n_eval_episodes=10)
        env.close()

        SAVE_PATH = os.path.join(OPT_DIR, 'trial2_{}_best_model'.format(trial.number))
        model.save(SAVE_PATH)
        return mean_reward
    #except Exception as e: 
       # return -1000

In [95]:
study = optuna.create_study(direction='maximize')
study.optimize(optimize_agent, n_trials=10, n_jobs=1)

[32m[I 2023-08-06 00:21:44,382][0m A new study created in memory with name: no-name-8b50801e-e0b6-4b35-aeaf-30c5470d941d[0m
  
  import sys
  
  if __name__ == '__main__':
We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=3306 and n_envs=1)
  f"You have specified a mini-batch size of {batch_size},"
[32m[I 2023-08-06 01:34:14,359][0m Trial 0 finished with value: 1000.0 and parameters: {'n_steps': 3306, 'gamma': 0.9522440249359578, 'learning_rate': 8.125903885885212e-05, 'clip_range': 0.29781712531164484, 'gae_lambda': 0.9191355428852187}. Best is trial 0 with value: 1000.0.[0m
We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=7750 and n_envs=1)
  f"You have specified a mini-batch size of {batch_size},"
[32m[I 2023-08-06 02:41:28,468][0m Trial 1 finished with value: 5000.0 and parameters: {'n_steps': 7750, 'gamma': 0.9393040302387321, 'learning_rate': 4.0726384300541615e-05, 'clip_range': 0.127613959957

In [96]:
# Import os for file path management
import os 
# Import Base Callback for saving models
from stable_baselines3.common.callbacks import BaseCallback

In [26]:
LOG_DIR = "logs"
OPT_DIR = "optimized_models"


    
if not os.path.exists(LOG_DIR):
    os.makedirs(LOG_DIR)
    print(f"Directory '{LOG_DIR}' created.")

    # Create the OPT_DIR directory if it doesn't exist
if not os.path.exists(OPT_DIR):
    os.makedirs(OPT_DIR)
    print(f"Directory '{OPT_DIR}' created.")

Directory 'logs' created.
Directory 'optimized_models' created.


In [97]:
class TrainAndLoggingCallback(BaseCallback):

    def __init__(self, check_freq, save_path, verbose=1):
        super(TrainAndLoggingCallback, self).__init__(verbose)
        self.check_freq = check_freq
        self.save_path = save_path

    def _init_callback(self):
        if self.save_path is not None:
            os.makedirs(self.save_path, exist_ok=True)

    def _on_step(self):
        if self.n_calls % self.check_freq == 0:
            model_path = os.path.join(self.save_path, 'best_model_{}'.format(self.n_calls))
            self.model.save(model_path)

        return True

In [98]:
CHECKPOINT_DIR = './train_2/'


In [99]:
callback = TrainAndLoggingCallback(check_freq=10000, save_path=CHECKPOINT_DIR)

In [100]:
env = StreetFighter()
env = Monitor(env, LOG_DIR)
env = DummyVecEnv([lambda: env])
env = VecFrameStack(env, 4, channels_order='last')

In [120]:
model_params=study.best_params


In [121]:
model_params

{'n_steps': 5891,
 'gamma': 0.9647082311175234,
 'learning_rate': 1.6018206993126623e-05,
 'clip_range': 0.35672935145727014,
 'gae_lambda': 0.8930387929631031}

model_params

In [122]:
model_params['n_steps']=5888

In [123]:
model = PPO('CnnPolicy', env, tensorboard_log=LOG_DIR, verbose=1, **model_params)

Using cpu device
Wrapping the env in a VecTransposeImage.


In [124]:
model.load(os.path.join(OPT_DIR, "trial2_6_best_model.zip"))

<stable_baselines3.ppo.ppo.PPO at 0x1f3bc4a81d0>

In [125]:
model.learn(total_timesteps=100000, callback=callback)

Logging to ./logs/PPO_25
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 5.66e+03 |
|    ep_rew_mean     | 4.4e+03  |
| time/              |          |
|    fps             | 115      |
|    iterations      | 1        |
|    time_elapsed    | 51       |
|    total_timesteps | 5888     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 5.66e+03    |
|    ep_rew_mean          | 4.4e+03     |
| time/                   |             |
|    fps                  | 40          |
|    iterations           | 2           |
|    time_elapsed         | 292         |
|    total_timesteps      | 11776       |
| train/                  |             |
|    approx_kl            | 0.015121972 |
|    clip_fraction        | 0.0171      |
|    clip_range           | 0.357       |
|    entropy_loss         | -8.31       |
|    explained_variance   | -0.00132    |
|    

<stable_baselines3.ppo.ppo.PPO at 0x1f38e6cbf60>

In [129]:
model = PPO.load('./train_2/best_model_170000')

In [130]:
mean_reward, _ = evaluate_policy(model, env, n_eval_episodes=10, render=True)

KeyboardInterrupt: 

### Your implementation

In [12]:
import time
# Import PPO for algos
from stable_baselines3 import PPO
# Evaluate Policy
from stable_baselines3.common.evaluation import evaluate_policy
# Import Wrappers
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.vec_env import DummyVecEnv, VecFrameStack, VecTransposeImage

  from .autonotebook import tqdm as notebook_tqdm
