In [48]:
import gym

In [49]:
import retro
import time

In [50]:
retro.data.list_games()

['1942-Nes',
 '1943-Nes',
 '3NinjasKickBack-Genesis',
 '8Eyes-Nes',
 'AaahhRealMonsters-Genesis',
 'AbadoxTheDeadlyInnerWar-Nes',
 'AcceleBrid-Snes',
 'ActRaiser2-Snes',
 'ActionPachio-Snes',
 'AddamsFamily-GameBoy',
 'AddamsFamily-Genesis',
 'AddamsFamily-Nes',
 'AddamsFamily-Sms',
 'AddamsFamily-Snes',
 'AddamsFamilyPugsleysScavengerHunt-Nes',
 'AddamsFamilyPugsleysScavengerHunt-Snes',
 'AdvancedBusterhawkGleylancer-Genesis',
 'Adventure-Atari2600',
 'AdventureIsland-GameBoy',
 'AdventureIsland3-Nes',
 'AdventureIslandII-Nes',
 'AdventuresOfBatmanAndRobin-Genesis',
 'AdventuresOfBayouBilly-Nes',
 'AdventuresOfDinoRiki-Nes',
 'AdventuresOfDrFranken-Snes',
 'AdventuresOfKidKleets-Snes',
 'AdventuresOfMightyMax-Genesis',
 'AdventuresOfMightyMax-Snes',
 'AdventuresOfRockyAndBullwinkleAndFriends-Genesis',
 'AdventuresOfRockyAndBullwinkleAndFriends-Nes',
 'AdventuresOfRockyAndBullwinkleAndFriends-Snes',
 'AdventuresOfStarSaver-GameBoy',
 'AdventuresOfYogiBear-Snes',
 'AeroFighters-Snes',
 

In [51]:
from gym import Env
from gym.spaces import MultiBinary, Box
import numpy as np
import cv2
from matplotlib import pyplot as plt

In [52]:
class StreetFighter(Env):
    def __init__(self):
        super().__init__()
        #action space and obs space
        self.observation_space = Box(low=0, high=255, shape=(84, 84, 1), dtype=np.uint8)
        self.action_space = MultiBinary(12)
        #start game instance
        self.game = retro.make(game='StreetFighterIISpecialChampionEdition-Genesis', 
                               use_restricted_actions=retro.Actions.FILTERED)
    
    def step(self, action):
        obs, reward, done , info = self.game.step(action)
        obs = self.preprocess(obs)

        frame_delta = obs - self.previous_frame
        self.previous_frame = obs

        reward = info['score'] - self.score
        self.score = info['score'] 

        return frame_delta, reward, done, info
    
    def render(self, *args, **kwargs):
        self.game.render()
    
    def reset(self):
        obs = self.game.reset()
        obs = self.preprocess(obs)
        self.previous_frame = obs
        
        #change in score is the reward function
        self.score = 0
        return obs
    
    def preprocess(self, observation):
        #grayscale
        gray = cv2.cvtColor(observation, cv2.COLOR_BGR2GRAY)
        #resize
        resize = cv2.resize(gray, (84, 84), interpolation=cv2.INTER_CUBIC)
        #add channels value
        channels = np.reshape(resize, (84, 84, 1))
        
        return channels
    
    def close(self):
        self.game.close()

In [53]:
import optuna
from stable_baselines3 import PPO #ppo alg for rl, openai uses this as the main algo for training
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.vec_env import DummyVecEnv, VecFrameStack
import os

In [54]:
LOG_DIR = './logs'
OPT_DIR = './opt'


In [65]:
#objective function (to be replaced by user input at start of character creation)
def optimize_ppo(trail):
    return {
        'n_steps':trail.suggest_int('n_steps', 2058, 8192),
        'gamma':trail.suggest_loguniform('gamma', 0.8, 0.9999),
        'learning_rate':trail.suggest_loguniform('learning_rate', 1e-5, 1e-4),
        'clip_range':trail.suggest_uniform('clip_range', 0.1, 0.4),
        'gae_lambda':trail.suggest_uniform('gae_lambda', 0.8, 0.99)
    }

In [66]:
#training loop
def optimize_agent(trail):
    try:
        model_params = optimize_ppo(trail)

        env = StreetFighter()
        env = Monitor(env, LOG_DIR)
        env = DummyVecEnv([lambda: env])
        env = VecFrameStack(env, 4, channels_order='last')
        
        model = PPO('CnnPolicy', env, tensorboard_log=LOG_DIR, verbose=0, **model_params)
        model.learn(total_timesteps=30000)

        mean_reward, _ = evaluate_policy(model, env, n_eval_episodes=5)
        env.close()

        SAVE_PATH = os.path.join(OPT_DIR, 'trail_{}_model'.format(trail.number))
        model.save(SAVE_PATH)
        
        return mean_reward
    
    except Exception as e:
        return -1000

In [67]:
study = optuna.create_study(direction='maximize')
study.optimize(optimize_agent, n_trials=10, n_jobs=1)

[32m[I 2023-03-04 11:18:53,173][0m A new study created in memory with name: no-name-a390afba-5058-48a4-9d5d-9c0a2edcbd52[0m
  'gamma':trail.suggest_loguniform('gamma', 0.8, 0.9999),
  'learning_rate':trail.suggest_loguniform('learning_rate', 1e-5, 1e-4),
  'clip_range':trail.suggest_uniform('clip_range', 0.1, 0.4),
  'gae_lambda':trail.suggest_uniform('gae_lambda', 0.8, 0.99)
We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=3523 and n_envs=1)
[33m[W 2023-03-04 11:18:59,029][0m Trial 0 failed with parameters: {'n_steps': 3523, 'gamma': 0.8418228980643886, 'learning_rate': 1.4022023469669974e-05, 'clip_range': 0.2807338951303459, 'gae_lambda': 0.9190149817880865} because of the following error: KeyboardInterrupt().[0m
Traceback (most recent call last):
  File "c:\Users\ahmed\miniconda3\envs\gameEnvBase\lib\site-packages\optuna\study\_optimize.py", line 200, in _run_trial
    value_or_values = func(trial)
  File "C:\Users\ahmed\AppData\Local\T

KeyboardInterrupt: 