In [1]:
import torch
import numpy as np 
from stable_baselines3 import PPO
from gym_functions import *


In [3]:
# Define a class for parameters
class TradingParameters:
    """Hard-coded parameter set for the option market-making experiment.

    Every value is fixed at construction time; the constructor takes no
    arguments. Per-option quantities are stored with one entry per strike
    in ``K`` (three strikes here).

    Attributes:
        S0: Initial stock price.
        sigma: Daily volatility.
        T: Total time in trading days.
        dt: Time step in days.
        K: Strike prices, one per option.
        time: Time to maturity in days, one per option.
        r: Risk-free rate.
        V: Volatility matrix for shocks (0.1 on the diagonal, 0 elsewhere).
        psi0: Terminal inventory liquidation cost coefficient.
        A_plus, kappa_plus: Arrival-rate and sensitivity parameters for buy
            orders, one per option.
        A_minus, kappa_minus: Arrival-rate and sensitivity parameters for
            sell orders, one per option.
        bid_ranges, ask_ranges: ``(low, high)`` spread bounds, one tuple per
            option.
        gamma: Entropy regularization parameter.
    """

    def __init__(self):
        # --- Underlying asset and simulation horizon ---
        self.S0 = 100.0
        self.sigma = 0.02
        self.T = 10
        self.dt = 1

        # --- Option book: everything below is sized by the strike count ---
        self.K = np.array((95, 100, 105))
        n_options = len(self.K)
        self.time = np.full(n_options, 30)
        self.r = 0.01
        self.V = 0.1 * np.eye(n_options)
        self.psi0 = 0.1

        # --- Order-flow parameters (identical across options) ---
        self.A_plus = np.full(n_options, 100)
        self.kappa_plus = np.full(n_options, 1.5)
        self.A_minus = np.full(n_options, 100)
        self.kappa_minus = np.full(n_options, 1.5)

        # --- Quote bounds: separate list objects for bid and ask sides ---
        spread_bounds = (0.01, 1.0)
        self.bid_ranges = [spread_bounds] * n_options
        self.ask_ranges = [spread_bounds] * n_options

        # --- Learning ---
        self.gamma = 0.1

In [4]:
# Train a PPO agent on the option market-making environment, then play one
# episode with the trained policy and print the per-step and running reward.

# Fixed seed so that training and the stochastic evaluation rollout below can
# be reproduced after a kernel restart.
SEED = 0

paras = TradingParameters()
env = OptionMarketMakingEnv(paras)

# Use the project's CustomFeatureExtractor (128-dimensional features) in place
# of the policy's default feature extractor.
policy_kwargs = dict(
    features_extractor_class=CustomFeatureExtractor,
    features_extractor_kwargs=dict(features_dim=128),
)

# Initialize the PPO model
model = PPO(
    policy='MlpPolicy',
    env=env,
    verbose=1,
    ent_coef=paras.gamma,  # Entropy coefficient taken from the parameter set
    policy_kwargs=policy_kwargs,
    # Seeds the pseudo-random generators managed by Stable-Baselines3.
    # NOTE(review): if OptionMarketMakingEnv draws from its own RNG object,
    # that generator must be seeded separately -- confirm in gym_functions.
    seed=SEED,
)

# Train the agent.
# PPO gathers experience in whole rollouts of `n_steps` (default 2048), so a
# budget of 1000 still runs one full rollout -- the training log reports
# total_timesteps = 2048. Raise this value for a meaningful training run.
model.learn(total_timesteps=1000)  # Adjust timesteps as needed

# Run the trained agent for a single episode.
# Actions are sampled from the policy (deterministic=False), so this rollout
# is only repeatable because of the seed set above.
obs = env.reset()
done = False
total_reward = 0
while not done:
    action, _states = model.predict(obs, deterministic=False)
    obs, reward, done, info = env.step(action)
    total_reward += reward
    print(f"Step: {env.current_step}, Reward: {reward}, Total Reward: {total_reward}")


  logger.warn(f"Box bound precision lowered by casting to {self.dtype}")


Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.




----------------------------------
| rollout/           |           |
|    ep_len_mean     | 9         |
|    ep_rew_mean     | -1.64e+05 |
| time/              |           |
|    fps             | 288       |
|    iterations      | 1         |
|    time_elapsed    | 7         |
|    total_timesteps | 2048      |
----------------------------------
Step: 1, Reward: 68.04024113295536, Total Reward: 68.04024113295536
Step: 2, Reward: 37.65628164379281, Total Reward: 105.69652277674817
Step: 3, Reward: 95.43222582424394, Total Reward: 201.1287486009921
Step: 4, Reward: 102.73945204870807, Total Reward: 303.86820064970016
Step: 5, Reward: 93.64717079225306, Total Reward: 397.5153714419532
Step: 6, Reward: 56.35842246457719, Total Reward: 453.8737939065304
Step: 7, Reward: -3.241483680636726, Total Reward: 450.63231022589366
Step: 8, Reward: 68.74270891343156, Total Reward: 519.3750191393252
Step: 9, Reward: -344784.954334621, Total Reward: -344265.5793154817
