In [None]:
import wandb
wandb.init(project="test")

In [None]:
from wandb.integration.sb3 import WandbCallback

In [1]:
'''
A large part of the code in this file was sourced from the rl-baselines-zoo library on GitHub.
In particular, the library provides a great parameter optimization set for the PPO2 algorithm,
as well as a great example implementation using optuna.
Source: https://github.com/araffin/rl-baselines-zoo/blob/master/utils/hyperparams_opt.py
'''

import optuna

import pandas as pd
import numpy as np

from pathlib import Path
import time


import numpy as np
import os
import datetime
import csv
import argparse
from functools import partial


import gym
from stable_baselines3 import PPO
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.vec_env import DummyVecEnv, VecVideoRecorder, SubprocVecEnv
#from wandb.integration.sb3 import WandbCallback
#import wandb

#env = Template_Gym()
#from stable_baselines.gail import generate_expert_traj

#from stable_baselines.gail import ExpertDataset


timestamp = datetime.datetime.now().strftime('%y%m%d%H%M%S')
config = {"policy_type": "MlpPolicy", "total_timesteps": 25000}
experiment_name = f"PPO_{int(time.time())}"
class Optimization():
    def __init__(self):

        self.reward_strategy = 'sortino2'
        #self.input_data_file = 'data/coinbase_hourly.csv'
        self.params_db_file = 'sqlite:///params.db'

        # number of parallel jobs
        self.n_jobs = 1
        # maximum number of trials for finding the best hyperparams
        self.n_trials = 100
        #number of test episodes per trial
        self.n_test_episodes = 10
        # number of evaluations for pruning per trial
        self.n_evaluations = 10


        #self.df = pd.read_csv(input_data_file)
        #self.df = df.drop(['Symbol'], axis=1)
        #self.df = df.sort_values(['Date'])
        #self.df = add_indicators(df.reset_index())

        #self.train_len = int(len(df) * 0.8)

        #self.df = df[:train_len]

        #self.validation_len = int(train_len * 0.8)
        #self.train_df = df[:validation_len]
        #self.test_df = df[validation_len:]

    def make_env(self, env_id, rank, seed=0, eval=False):
        """
        Utility function for multiprocessed env.
    
        :param env_id: (str) the environment ID
        :param num_env: (int) the number of environment you wish to have in subprocesses
        :param seed: (int) the inital seed for RNG
        :param rank: (int) index of the subprocess
        """
        def _init():
            self.eval= eval
            env = gym.make("CartPole-v1")
            env.seed(seed + rank)
            return env
        #set_global_seeds(seed)
        return _init
    #def make_env():
        #env = gym.make("CartPole-v1")
        #env = Monitor(env)  # record stats such as returns
        #return env
    

    def optimize_envs(self, trial):
        return {
            'reward_func': self.reward_strategy,
            'forecast_len': int(trial.suggest_loguniform('forecast_len', 1, 200)),
            'confidence_interval': trial.suggest_uniform('confidence_interval', 0.7, 0.99),
        }


    def optimize_ppo2(self,trial):
        return {
            'n_steps': int(trial.suggest_loguniform('n_steps', 16, 2048)),
            'gamma': trial.suggest_loguniform('gamma', 0.9, 0.9999),
            'learning_rate': trial.suggest_loguniform('learning_rate', 1e-5, 1.),
            'ent_coef': trial.suggest_loguniform('ent_coef', 1e-8, 1e-1),
            'clip_range': trial.suggest_uniform('clip_range', 0.1, 0.4),
            'n_epochs': int(trial.suggest_loguniform('n_epochs', 1, 48)),
            #'lam': trial.suggest_uniform('lam', 0.8, 1.)
        }


    def optimize_agent(self,trial):
        #self.env_params = self.optimize_envs(trial)
        env_id = "default"
        num_e = 1  # Number of processes to use
        env = gym.make("CartPole-v1")
        #self.train_env = DummyVecEnv([lambda: env()])
        self.train_env = gym.make('CartPole-v1')
        #self.train_env = VecNormalize(self.train_env, norm_obs=True, norm_reward=True)
        #self.test_env = DummyVecEnv([lambda: env()])
        self.test_env = env = gym.make('CartPole-v1')
        #self.test_env = VecNormalize(self.train_env, norm_obs=True, norm_reward=True)

        self.model_params = self.optimize_ppo2(trial)
        
        self.model = PPO(config["policy_type"], self.train_env, verbose=0, tensorboard_log=Path("./tensorboard2").name, **self.model_params)
        #self.model = PPO2(CustomPolicy_2, self.env, verbose=0, learning_rate=1e-4, nminibatches=1, tensorboard_log="./min1" )

        last_reward = -np.finfo(np.float16).max
        #evaluation_interval = int(len(train_df) / self.n_evaluations)
        evaluation_interval = 3000

        for eval_idx in range(self.n_evaluations):
            try:
                self.model.learn(evaluation_interval)
            except AssertionError:
                raise

            rewards = []
            n_episodes, reward_sum = 0, 0.0

            obs = self.test_env.reset()
            while n_episodes < self.n_test_episodes:
                action, _ = self.model.predict(obs)
                obs, reward, done, _ = self.test_env.step(action)
                reward_sum += reward

                if done:
                    rewards.append(reward_sum)
                    reward_sum = 0.0
                    n_episodes += 1
                    obs = self.test_env.reset()

            last_reward = np.mean(rewards)
            trial.report(-1 * last_reward, eval_idx)

            #if trial.should_prune(eval_idx):
                #raise optuna.structs.TrialPruned()

        return -1 * last_reward


    def optimize(self):
        study_name = 'ppo299_' + self.reward_strategy
        #study = optuna.create_study(
            #study_name=study_name, storage=self.params_db_file, load_if_exists=True)
        study = optuna.create_study(
            study_name=study_name, storage=self.params_db_file, load_if_exists=True)
        try:
            study.optimize(self.optimize_agent, n_trials=self.n_trials, n_jobs=self.n_jobs)
        except KeyboardInterrupt:
            pass

        print('Number of finished trials: ', len(study.trials))

        print('Best trial:')
        trial = study.best_trial

        print('Value: ', trial.value)

        print('Params: ')
        for key, value in trial.params.items():
            print('    {}: {}'.format(key, value))

        return study.trials_dataframe()

#if __name__ == '__main__':
run = Optimization()
run.optimize()

[32m[I 2021-07-30 16:43:01,229][0m A new study created in RDB with name: ppo299_sortino2[0m
We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=375 and n_envs=1)
  f"You have specified a mini-batch size of {batch_size},"
[32m[I 2021-07-30 16:43:17,711][0m Trial 0 finished with value: -69.7 and parameters: {'n_steps': 375.96192885266925, 'gamma': 0.9655210634642647, 'learning_rate': 0.00015848728363978832, 'ent_coef': 6.774265231303632e-08, 'clip_range': 0.1995232886017248, 'n_epochs': 1.0043708215715677}. Best is trial 0 with value: -69.7.[0m
We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=1150 and n_envs=1)
  f"You have specified a mini-batch size of {batch_size},"
[32m[I 2021-07-30 16:43:45,403][0m Trial 1 finished with value: -365.1 and parameters: {'n_steps': 1150.4439126991792, 'gamma': 0.9363397206393553, 'learning_rate': 0.002574101011476685, 'ent_coef': 3.4885903722619094e-05, 'clip_range': 0.

[32m[I 2021-07-30 16:50:29,030][0m Trial 10 finished with value: -9.3 and parameters: {'n_steps': 2047.5955765610604, 'gamma': 0.922780686741381, 'learning_rate': 0.12265437843834635, 'ent_coef': 2.4820350032650058e-05, 'clip_range': 0.11235873603905017, 'n_epochs': 47.40673532043894}. Best is trial 1 with value: -365.1.[0m
We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=594 and n_envs=1)
  f"You have specified a mini-batch size of {batch_size},"
[32m[I 2021-07-30 16:50:53,807][0m Trial 11 finished with value: -149.2 and parameters: {'n_steps': 594.8419406475872, 'gamma': 0.9281308448569101, 'learning_rate': 0.0013746572057786502, 'ent_coef': 0.00014634722194711283, 'clip_range': 0.17400432928595921, 'n_epochs': 1.8976793038087894}. Best is trial 1 with value: -365.1.[0m
We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=743 and n_envs=1)
  f"You have specified a mini-batch size of {batch_size},"
[32m

Number of finished trials:  19
Best trial:
Value:  -422.9
Params: 
    clip_range: 0.22329532529062732
    ent_coef: 4.7176480359337634e-06
    gamma: 0.9996734300341857
    learning_rate: 0.0007868638742019481
    n_epochs: 4.10739361874997
    n_steps: 1119.9072750899527


Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_clip_range,params_ent_coef,params_gamma,params_learning_rate,params_n_epochs,params_n_steps,state
0,0,-69.7,2021-07-30 16:43:01.270852,2021-07-30 16:43:17.682306,00:00:16.411454,0.199523,6.774265e-08,0.965521,0.000158,1.004371,375.961929,COMPLETE
1,1,-365.1,2021-07-30 16:43:17.714981,2021-07-30 16:43:45.369104,00:00:27.654123,0.168132,3.48859e-05,0.93634,0.002574,3.222209,1150.443913,COMPLETE
2,2,-103.0,2021-07-30 16:43:45.407851,2021-07-30 16:44:34.835498,00:00:49.427647,0.344423,2.034752e-06,0.937947,0.004233,22.986353,168.052912,COMPLETE
3,3,-9.6,2021-07-30 16:44:34.866702,2021-07-30 16:44:55.407044,00:00:20.540342,0.239389,0.001057548,0.984803,0.017578,2.751369,95.54045,COMPLETE
4,4,-87.5,2021-07-30 16:44:55.435277,2021-07-30 16:45:18.965896,00:00:23.530619,0.3858,1.395738e-06,0.969958,4.5e-05,4.128084,45.899706,COMPLETE
5,5,-221.7,2021-07-30 16:45:19.001041,2021-07-30 16:45:39.779680,00:00:20.778639,0.204749,0.0007772477,0.990752,0.002455,1.494602,284.360187,COMPLETE
6,6,-138.0,2021-07-30 16:45:39.817929,2021-07-30 16:46:14.635955,00:00:34.818026,0.304674,0.01092933,0.950058,3.4e-05,8.150697,37.375459,COMPLETE
7,7,-51.5,2021-07-30 16:46:14.669138,2021-07-30 16:46:43.976614,00:00:29.307476,0.282713,4.258884e-05,0.904336,1e-05,7.377646,1971.999042,COMPLETE
8,8,-9.2,2021-07-30 16:46:44.008058,2021-07-30 16:48:31.672601,00:01:47.664543,0.114702,0.001746112,0.91155,0.675674,19.177987,17.360869,COMPLETE
9,9,-124.3,2021-07-30 16:48:31.702212,2021-07-30 16:48:53.001079,00:00:21.298867,0.381342,3.246627e-08,0.950959,0.000258,1.646246,36.887792,COMPLETE


In [4]:
import time
import gym
from stable_baselines3 import PPO
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.vec_env import DummyVecEnv, VecVideoRecorder
from wandb.integration.sb3 import WandbCallback
import wandb

config = {"policy_type": "MlpPolicy", "total_timesteps": 25000}
experiment_name = f"PPO_{int(time.time())}"

# Initialise a W&B run
wandb.init(
    name=experiment_name,
    project="test",
    config=config,
    sync_tensorboard=True,  # auto-upload sb3's tensorboard metrics
    monitor_gym=True,  # auto-upload the videos of agents playing the game
    save_code=True,  # optional
)

def make_env():
    env = gym.make("CartPole-v1")
    env = Monitor(env)  # record stats such as returns
    return env

env = DummyVecEnv([make_env])

env = VecVideoRecorder(env, "videos",
    record_video_trigger=lambda x: x % 2000 == 0, video_length=200)

model = PPO(config["policy_type"], env, verbose=1,
    tensorboard_log=f"runs/{experiment_name}")

# Add the WandbCallback 
model.learn(
    total_timesteps=config["total_timesteps"],
    callback=WandbCallback(
        gradient_save_freq=100,
        model_save_freq=1000,
        model_save_path=f"models/{experiment_name}",
    ),
)

wandb: Currently logged in as: adaptationai (use `wandb login --relogin` to force relogin)
wandb: wandb version 0.11.1 is available!  To upgrade, please run:
wandb:  $ pip install wandb --upgrade


Using cpu device
Logging to runs/PPO_1627624999/PPO_1
Saving video to /home/adaptation/Documents/github/adaptationio/Shaman-AI/videos/rl-video-step-0-to-step-200.mp4
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 23.4     |
|    ep_rew_mean     | 23.4     |
| time/              |          |
|    fps             | 380      |
|    iterations      | 1        |
|    time_elapsed    | 5        |
|    total_timesteps | 2048     |
---------------------------------
Saving video to /home/adaptation/Documents/github/adaptationio/Shaman-AI/videos/rl-video-step-2000-to-step-2200.mp4
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 27.4         |
|    ep_rew_mean          | 27.4         |
| time/                   |              |
|    fps                  | 367          |
|    iterations           | 2            |
|    time_elapsed         | 11           |
|    total_timesteps      | 4096  

Saving video to /home/adaptation/Documents/github/adaptationio/Shaman-AI/videos/rl-video-step-20000-to-step-20200.mp4
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 145          |
|    ep_rew_mean          | 145          |
| time/                   |              |
|    fps                  | 384          |
|    iterations           | 10           |
|    time_elapsed         | 53           |
|    total_timesteps      | 20480        |
| train/                  |              |
|    approx_kl            | 0.0050595244 |
|    clip_fraction        | 0.0241       |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.566       |
|    explained_variance   | 0.499        |
|    learning_rate        | 0.0003       |
|    loss                 | 3.25         |
|    n_updates            | 90           |
|    policy_gradient_loss | -0.000633    |
|    value_loss           | 46.5         |
----------------------

KeyboardInterrupt: 

In [None]:
import gym

from stable_baselines3 import PPO

env = gym.make("CartPole-v1")

model = PPO("MlpPolicy", env, verbose=1)
model.learn(total_timesteps=10000)

obs = env.reset()
for i in range(1000):
    action, _states = model.predict(obs, deterministic=True)
    obs, reward, done, info = env.step(action)
    env.render()
    if done:
      obs = env.reset()

env.close()

In [None]:
"""
    Sampler for PPO hyperparams.
    :param trial:
    :return:
    """
    batch_size = trial.suggest_categorical("batch_size", [8, 16, 32, 64, 128, 256, 512])
    n_steps = trial.suggest_categorical("n_steps", [8, 16, 32, 64, 128, 256, 512, 1024, 2048])
    gamma = trial.suggest_categorical("gamma", [0.9, 0.95, 0.98, 0.99, 0.995, 0.999, 0.9999])
    learning_rate = trial.suggest_loguniform("learning_rate", 1e-5, 1)
    lr_schedule = "constant"
    # Uncomment to enable learning rate schedule
    # lr_schedule = trial.suggest_categorical('lr_schedule', ['linear', 'constant'])
    ent_coef = trial.suggest_loguniform("ent_coef", 0.00000001, 0.1)
    clip_range = trial.suggest_categorical("clip_range", [0.1, 0.2, 0.3, 0.4])
    n_epochs = trial.suggest_categorical("n_epochs", [1, 5, 10, 20])
    gae_lambda = trial.suggest_categorical("gae_lambda", [0.8, 0.9, 0.92, 0.95, 0.98, 0.99, 1.0])
    max_grad_norm = trial.suggest_categorical("max_grad_norm", [0.3, 0.5, 0.6, 0.7, 0.8, 0.9, 1, 2, 5])
    vf_coef = trial.suggest_uniform("vf_coef", 0, 1)
    net_arch = trial.suggest_categorical("net_arch", ["small", "medium"])
    # Uncomment for gSDE (continuous actions)
    # log_std_init = trial.suggest_uniform("log_std_init", -4, 1)
    # Uncomment for gSDE (continuous action)
    # sde_sample_freq = trial.suggest_categorical("sde_sample_freq", [-1, 8, 16, 32, 64, 128, 256])
    # Orthogonal initialization
    ortho_init = False
    # ortho_init = trial.suggest_categorical('ortho_init', [False, True])
    # activation_fn = trial.suggest_categorical('activation_fn', ['tanh', 'relu', 'elu', 'leaky_relu'])
    activation_fn = trial.suggest_categorical("activation_fn", ["tanh", "relu"])

In [2]:
'''
A large part of the code in this file was sourced from the rl-baselines-zoo library on GitHub.
In particular, the library provides a great parameter optimization set for the PPO2 algorithm,
as well as a great example implementation using optuna.
Source: https://github.com/araffin/rl-baselines-zoo/blob/master/utils/hyperparams_opt.py
'''

import optuna

import pandas as pd
import numpy as np

from pathlib import Path
import time

import gym
import numpy as np
import os
import datetime
import csv
import argparse
from functools import partial
import time
import gym
from stable_baselines3 import PPO
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.vec_env import DummyVecEnv, VecVideoRecorder, SubprocVecEnv, VecNormalize 
#from stable_baselines3 import PPO
#from stable_baselines3.common.vec_env import DummyVecEnv, SubprocVecEnv
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.utils import set_random_seed
#from wandb.integration.sb3 import WandbCallback
#import wandb


#from stable_baselines.common.policies import MlpLnLstmPolicy, LstmPolicy, CnnPolicy, MlpPolicy
#from stable_baselines.common.vec_env import DummyVecEnv, SubprocVecEnv,VecNormalize 
#from stable_baselines3.common import set_global_seeds
#from stable_baselines import ACKTR, PPO2, SAC
#from stable_baselines.deepq import DQN
#from stable_baselines.deepq.policies import FeedForwardPolicy
#from ..env import Template_Gym
#from ..common import CustomPolicy, CustomPolicy_2, CustomLSTMPolicy, CustomPolicy_4, CustomPolicy_3, CustomPolicy_5
#from ..common import PairList, PairConfig, PairsConfigured
#env = Template_Gym()
#from stable_baselines.gail import generate_expert_traj

#from stable_baselines.gail import ExpertDataset


timestamp = datetime.datetime.now().strftime('%y%m%d%H%M%S')
#pc = PairsConfigured()

config = {"policy_type": "MlpPolicy", "total_timesteps": 25000}
experiment_name = f"PPO_{int(time.time())}"


class Optimization():
    def __init__(self, config):
        
        self.reward_strategy = 'Name it'
        #self.input_data_file = 'data/coinbase_hourly.csv'
        self.params_db_file = 'sqlite:///params.db'

        # number of parallel jobs
        self.n_jobs = 1
        # maximum number of trials for finding the best hyperparams
        self.n_trials = 100
        #number of test episodes per trial
        self.n_test_episodes = 10
        # number of evaluations for pruning per trial
        self.n_evaluations = 10
        self.config = config

        #self.df = pd.read_csv(input_data_file)
        #self.df = df.drop(['Symbol'], axis=1)
        #self.df = df.sort_values(['Date'])
        #self.df = add_indicators(df.reset_index())

        #self.train_len = int(len(df) * 0.8)

        #self.df = df[:train_len]

        #self.validation_len = int(train_len * 0.8)
        #self.train_df = df[:validation_len]
        #self.test_df = df[validation_len:]

    #def make_env(self, env_id, rank, seed=0, eval=False):
        """
        Utility function for multiprocessed env.
    
        :param env_id: (str) the environment ID
        :param num_env: (int) the number of environment you wish to have in subprocesses
        :param seed: (int) the inital seed for RNG
        :param rank: (int) index of the subprocess
        """
        #def _init():
            #self.config = config
            #self.eval= eval
            #env = gym.make(config["env_name"])
            #env = Monitor(env)
            #env = Template_Gym(config=self.config, eval=self.eval)
            #env.seed(seed + rank)
            #return env
        #set_global_seeds(seed)
        #return _init
    #def make_env(env_id, rank, seed=0):
        """
        Utility function for multiprocessed env.

        :param env_id: (str) the environment ID
        :param num_env: (int) the number of environments you wish to have in subprocesses
        :param seed: (int) the inital seed for RNG
        :param rank: (int) index of the subprocess
        """
        #def _init():
            #env = gym.make(env_id)
            #env.seed(seed + rank)
            #return env
        #set_random_seed(seed)
        #return _init
    
    def make_env():
        env = gym.make(config["env_name"])
        env = Monitor(env)  # record stats such as returns
        return env
    
    # Categorical parameter
    #optimizer = trial.suggest_categorical('optimizer', ['MomentumSGD', 'Adam'])

    # Int parameter
    #num_layers = trial.suggest_int('num_layers', 1, 3)

    # Uniform parameter
    #dropout_rate = trial.suggest_uniform('dropout_rate', 0.0, 1.0)

    # Loguniform parameter
    #learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 1e-2)

    # Discrete-uniform parameter
    #drop_path_rate = trial.suggest_discrete_uniform('drop_path_rate', 0.0, 1.0, 0.1)
    def optimize_envs(self, trial):
        return {
            'reward_func': self.reward_strategy,
            'forecast_len': int(trial.suggest_loguniform('forecast_len', 1, 200)),
            'confidence_interval': trial.suggest_uniform('confidence_interval', 0.7, 0.99),
        }

    def optimize_config(self, trial):
        return {
            'sl': trial.suggest_loguniform('sl', 1.0, 10.0),
            'tp': trial.suggest_loguniform('tp', 1.0 ,10.0)
            
        }

    def optimize_ppo2(self,trial):
        return {
            #'n_steps': int(trial.suggest_int('n_steps', 16, 2048)),
            #'gamma': trial.suggest_loguniform('gamma', 0.9, 0.9999),
            #'learning_rate': trial.suggest_loguniform('learning_rate', 1e-5, 1.),
            #'ent_coef': trial.suggest_loguniform('ent_coef', 1e-8, 1e-1),
            #'cliprange': trial.suggest_uniform('cliprange', 0.1, 0.4),
            #'noptepochs': int(trial.suggest_int('noptepochs', 1, 48)),
            #'lam': trial.suggest_uniform('lam', 0.8, 1.)
            
            
            'batch_size': trial.suggest_categorical("batch_size", [8, 16, 32, 64, 128, 256, 512]),
            'n_steps': int(trial.suggest_categorical("n_steps", [8, 16, 32, 64, 128, 256, 512, 1024, 2048])),
            'gamma': trial.suggest_categorical("gamma", [0.9, 0.95, 0.98, 0.99, 0.995, 0.999, 0.9999]),
            'learning_rate': trial.suggest_loguniform("learning_rate", 1e-5, 1),
            #'lr_schedule' = "constant"
            # Uncomment to enable learning rate schedule
            # lr_schedule = trial.suggest_categorical('lr_schedule', ['linear', 'constant'])
            'ent_coef': trial.suggest_loguniform("ent_coef", 0.00000001, 0.1),
            'clip_range': trial.suggest_categorical("clip_range", [0.1, 0.2, 0.3, 0.4]),
            'n_epochs': trial.suggest_categorical("n_epochs", [1, 5, 10, 20]),
            'gae_lambda': trial.suggest_categorical("gae_lambda", [0.8, 0.9, 0.92, 0.95, 0.98, 0.99, 1.0]),
            'max_grad_norm': trial.suggest_categorical("max_grad_norm", [0.3, 0.5, 0.6, 0.7, 0.8, 0.9, 1, 2, 5]),
            'vf_coef': trial.suggest_uniform("vf_coef", 0, 1)
            #'net_arch' = trial.suggest_categorical("net_arch", ["small", "medium"])
            # Uncomment for gSDE (continuous actions)
            # log_std_init = trial.suggest_uniform("log_std_init", -4, 1)
            # Uncomment for gSDE (continuous action)
            # sde_sample_freq = trial.suggest_categorical("sde_sample_freq", [-1, 8, 16, 32, 64, 128, 256])
            # Orthogonal initialization
            #ortho_init = False
            # ortho_init = trial.suggest_categorical('ortho_init', [False, True])
            # activation_fn = trial.suggest_categorical('activation_fn', ['tanh', 'relu', 'elu', 'leaky_relu'])
            #'activation_fn': trial.suggest_categorical("activation_fn", ["tanh", "relu"])
        }

    def optimize_lstm(self, trial):
        return {
            'lstm': trial.suggest_categorical('optimizer', ['lstm', 'mlp'])
            
        }
    def ob_types(self, trial):
        return {
            'lstm': trial.suggest_categorical('optimizer', ['lstm', 'mlp'])
            
        }


    def optimize_agent(self,trial):
        run = wandb.init(
        project="sb3",
        config=config,
        sync_tensorboard=True,  # auto-upload sb3's tensorboard metrics
        monitor_gym=True,  # auto-upload the videos of agents playing the game
        save_code=True,  # optional
        )
        #self.env_params = self.optimize_envs(trial)
        env_id = "default"+str()
        num_e = self.n_jobs  # Number of processes to use
        #self.config_param = self.optimize_config(trial)
        #self.config.sl = self.config_param['sl']
        #self.config.sl = self.config_param['tp']
        #self.model_type = self.optimize_lstm(trial)
        #self.model_type = self.model_type['lstm']
        #self.model_type = "mlp"
        #if self.model_type == 'mlp':
            #self.policy = CustomPolicy_5
        #else:
             #self.policy = MlpPolicy
        #self.train_env = SubprocVecEnv([self.make_env(env_id+str('train'), i) for i in range(num_e)])
        #SubprocVecEnv([make_env(env_id, i) for i in range(num_cpu)])
        #self.train_env = SubprocVecEnv([self.make_env(env_id, i, eval=False) for i in range(num_e)])
        #self.train_env = VecNormalize(self.train_env, norm_obs=True, norm_reward=True)
        #self.test_env = SubprocVecEnv([self.make_env(env_id+str("test"), i) for i in range(num_e)])
        #self.test_env = SubprocVecEnv([self.make_env(env_id, i, eval=True) for i in range(num_e)])
        #self.test_env = VecNormalize(self.test_env, norm_obs=True, norm_reward=True)
        env = gym.make("CartPole-v1")
        self.train_env = DummyVecEnv([lambda: env])
        self.train_env = VecVideoRecorder(self.train_env, "videos", record_video_trigger=lambda x: x % 2000 == 0, video_length=200)
        #self.train_env = DummyVecEnv([env])
        #self.train_env = VecNormalize(self.train_env, norm_obs=True, norm_reward=True)
        self.test_env = DummyVecEnv([lambda: env])
        self.test_env = VecVideoRecorder(self.test_env, "videos", record_video_trigger=lambda x: x % 2000 == 0, video_length=200)
        #self.test_env = DummyVecEnv([env])
        try:
            self.test_env.load_running_average("saves")
            self.train_env.load_running_average("saves")
        except:
            print('cant load')
        self.model_params = self.optimize_ppo2(trial)
        self.model = PPO(config["policy_type"], self.train_env, verbose=1, tensorboard_log=f"runs", **self.model_params )
        #self.model = PPO2(CustomPolicy_2, self.env, verbose=0, learning_rate=1e-4, nminibatches=1, tensorboard_log="./min1" )

        last_reward = -np.finfo(np.float16).max
        #evaluation_interval = int(len(train_df) / self.n_evaluations)
        evaluation_interval = 3500

        for eval_idx in range(self.n_evaluations):
            try:
                #self.model.learn(evaluation_interval)
                self.model.learn(
                    total_timesteps=evaluation_interval,
                    callback=WandbCallback(gradient_save_freq=100,
                    model_save_path=f"models/{run.id}",
                    verbose=2,
                    ),
                )
                #self.test_env.save_running_average("saves")
                #self.train_env.save_running_average("saves")
            except:
                print('did not work')

            rewards = []
            n_episodes, reward_sum = 0, 0.0
            print('Eval')
            obs = self.test_env.reset()
            #state = None
            #done = [False for _ in range(self.env.num_envs)]
            while n_episodes < self.n_test_episodes:
                action, _ = self.model.predict(obs, deterministic=True)
                obs, reward, done, _ = self.test_env.step(action)
                reward_sum += reward

                if done:
                    rewards.append(reward_sum)
                    reward_sum = 0.0
                    n_episodes += 1
                    obs = self.test_env.reset()

            last_reward = np.mean(rewards)
            trial.report(-1 * last_reward, eval_idx)

            if trial.should_prune():
                raise optuna.structs.TrialPruned()
        run.finish()
        return -1 * last_reward


    def optimize(self, config):
        self.config = config
        study_name = 'ppo2_single_ready'
        study_name = 'ppo2_single_ready_nosltp'
        study_name = 'ppo2_single_ready_nosltp_all_yeah'
        study_name = 'ppo2_eur_gbp_op'
        study_name = 'ppo2_gbp_chf_op'
        study_name = 'ppo2_gbp_chf_h1_new1'
        study_name = 'ppo2_gbp_chf_h4_r_new11'
        study_name = 'ppo2_gbp_chf_h4_r_withvolfixed'
        study_name = 'ppo2_gbp_chf_h4_r_withvolclosefix212'
        study_name = 'ppo2_gbp_chf_h4_loged_sortinonew'
        study_name = 'AUD_CHF_4H_SELL_C5_NEW'
        study_name = 'wandb'
        study = optuna.create_study(
            study_name=study_name, storage=self.params_db_file, load_if_exists=True)

        try:
            study.optimize(self.optimize_agent, n_trials=self.n_trials, n_jobs=self.n_jobs)
        except KeyboardInterrupt:
            pass

        print('Number of finished trials: ', len(study.trials))

        print('Best trial:')
        trial = study.best_trial
        print(trial.number)
        print('Value: ', trial.value)

        print('Params: ')
        for key, value in trial.params.items():
            print('    {}: {}'.format(key, value))

        return study.trials_dataframe()


#if __name__ == '__main__':
    #optimize()
run = Optimization(config)
run.optimize(config)

SyntaxError: invalid syntax (<ipython-input-2-2a0fba73efa3>, line 246)