In [None]:
import numpy as np

# Old Version

In [None]:
class stock():
    """Single-asset price simulator with a small Monte Carlo call pricer (legacy version).

    Two volatility models are supported:

    * ``'gbm'``    - constant volatility ``sigma``.
    * ``'heston'`` - the variance follows a mean-reverting square-root process,
      discretised with an Euler step and floored at zero.

    Parameters
    ----------
    s0 : float
        Initial asset price.
    r : float
        Risk-free rate used as drift and for discounting.
    sigma : float
        Volatility (constant for GBM, initial volatility for Heston).
    T : float
        Time horizon.
    n : int
        Number of points on a simulated path; the step size is ``T / n``.
    model : str
        ``'gbm'`` (default) or ``'heston'``.
    """

    # Heston defaults used by ``vol``; override on the class or an instance to tune them.
    HESTON_KAPPA = 2.0   # speed of mean reversion of the variance
    HESTON_XI = 0.1      # volatility of the variance process ("vol of vol")

    def __init__(self, s0, r, sigma, T, n, model = 'gbm'):
        self.s0 = s0
        self.r = r
        self.T = T
        self.n = n
        self.dt = T/n
        self.model = model
        self.sigma = sigma

    def vol(self, sigma):
        """Return a length-``n`` volatility path for the configured model.

        For ``'gbm'`` the path is constant. For ``'heston'`` a variance path is
        simulated starting from ``sigma**2`` and reverting to ``self.sigma**2``;
        its square root is returned.

        Raises
        ------
        ValueError
            If ``self.model`` is neither ``'gbm'`` nor ``'heston'``.
        """
        if self.model == 'gbm':
            return np.array([sigma] * self.n)
        if self.model == 'heston':
            # The previous implementation called self.vol() from inside this branch
            # and therefore recursed until RecursionError.
            long_run_var = self.sigma ** 2
            variance = np.zeros(self.n)
            variance[0] = sigma ** 2
            sqrt_dt = np.sqrt(self.dt)
            for i in range(1, self.n):
                prev = variance[i-1]
                step = (self.HESTON_KAPPA * (long_run_var - prev) * self.dt
                        + self.HESTON_XI * np.sqrt(max(prev, 0)) * sqrt_dt * np.random.normal())
                # Floor at zero so the square root below stays real.
                variance[i] = max(prev + step, 0)
            return np.sqrt(variance)
        raise ValueError(f"Unknown model {self.model!r}; expected 'gbm' or 'heston'")

    @staticmethod
    def heston_model_sim(S0, v0, rho, kappa, theta, sigma,T, N, M, r=0.0):

        """
        Simulate M Heston paths with an Euler scheme.

        Inputs:
         - S0, v0: initial parameters for asset and variance
         - rho   : correlation between asset returns and variance
         - kappa : rate of mean reversion in variance process
         - theta : long-term mean of variance process
         - sigma : vol of vol / volatility of variance process
         - T     : time of simulation
         - N     : number of time steps
         - M     : number of scenarios / simulations
         - r     : risk-free drift (default 0.0; the previous code read an
                   undefined global ``r``)

        Outputs:
        - asset prices over time (numpy array, shape (N+1, M))
        - variance over time (numpy array, shape (N+1, M))
        """
        # initialise other parameters
        dt = T/N
        mu = np.array([0,0])
        cov = np.array([[1,rho],
                        [rho,1]])

        # arrays for storing prices and variances; dtype is forced to float because
        # np.full infers an integer dtype from an int fill value (e.g. S0=100),
        # which would truncate every simulated price.
        S = np.full(shape=(N+1,M), fill_value=S0, dtype=float)
        v = np.full(shape=(N+1,M), fill_value=v0, dtype=float)

        # sampling correlated brownian motions under risk-neutral measure
        Z = np.random.multivariate_normal(mu, cov, (N,M))

        for i in range(1,N+1):
            S[i] = S[i-1] * np.exp( (r - 0.5*v[i-1])*dt + np.sqrt(v[i-1] * dt) * Z[i-1,:,0] )
            v[i] = np.maximum(v[i-1] + kappa*(theta-v[i-1])*dt + sigma*np.sqrt(v[i-1]*dt)*Z[i-1,:,1],0)

        return S, v

    def simulate(self):
        """Simulate one price path of length ``n`` starting at ``s0``.

        The volatility path comes from ``vol`` so that ``model='heston'`` is
        honoured; for ``'gbm'`` the path is constant and the result matches the
        constant-volatility log-Euler scheme.
        """
        vol_path = self.vol(self.sigma)
        innovations = np.random.normal(0, 1, self.n)
        stock_prices = np.zeros(self.n)
        stock_prices[0] = self.s0

        for i in range(1, self.n):
            stock_prices[i] = stock_prices[i-1] * np.exp(
                (self.r - 0.5 * vol_path[i]**2) * self.dt + vol_path[i] * np.sqrt(self.dt) * innovations[i]
            )
        return stock_prices

    def option_price(self, K, n_paths=1000):
        """Monte Carlo price of a European call with strike ``K``.

        Parameters
        ----------
        K : float
            Strike price.
        n_paths : int
            Number of independent paths to average over (default 1000). The
            previous implementation averaged the payoff of a single path.

        Returns
        -------
        float
            Discounted mean payoff ``exp(-r*T) * mean(max(S_last - K, 0))``.

        Raises
        ------
        ValueError
            If ``n_paths`` is smaller than 1.
        """
        if n_paths < 1:
            raise ValueError("n_paths must be at least 1")
        terminal_prices = np.array([self.simulate()[-1] for _ in range(n_paths)])
        payoff = np.maximum(terminal_prices - K, 0)
        return np.exp(-self.r * self.T) * np.mean(payoff)


In [None]:
class simulation():
    """Scaffold of a QLBS-style backward recursion, written as a class body.

    Everything below is executed once, when the ``class`` statement itself is
    run: paths are simulated, features are built, the (placeholder) backward
    recursion is evaluated and the resulting price is printed. All names
    defined here (parameters, arrays, helper functions, and the ``np``/``plt``
    imports) end up as class attributes. No instance is needed.

    NOTE(review): the helper functions are defined without ``self`` and are
    called as plain functions from the class body; the ``np`` they use inside
    their bodies is looked up at module level, so a module-level
    ``import numpy as np`` is presumably required - confirm.
    """

    import numpy as np
    import matplotlib.pyplot as plt

    # Parameters for simulation
    T_steps = 50         # Number of time steps (T)
    K_paths = 1000       # Number of Monte Carlo paths (K)
    T_total = 1.0        # Total time horizon (years)
    dt = T_total / T_steps
    S0 = 100             # Initial stock price
    r = 0.05             # Risk-free rate
    sigma = 0.2          # Volatility
    strike = 100         # Strike price (Z in pseudocode)
    lambda_param = 0.5   # λ parameter

    def simulate_stock_prices(S0, r, sigma, T_steps, K_paths, dt):
      """
      Simulate stock prices using a geometric Brownian motion.

      Each step multiplies the previous row by
      exp((r - 0.5*sigma**2)*dt + sigma*sqrt(dt)*z) with z ~ N(0, 1) drawn
      independently per path.

      Returns an array S of shape (T_steps+1, K_paths); row 0 is filled with S0.
      """
      S = np.zeros((T_steps + 1, K_paths))
      S[0] = S0
      for t in range(1, T_steps + 1):
          z = np.random.standard_normal(K_paths)
          S[t] = S[t - 1] * np.exp((r - 0.5 * sigma**2) * dt + sigma * np.sqrt(dt) * z)
      return S

    # Simulate the stock paths (no random seed is set here, so each run differs)
    S = simulate_stock_prices(S0, r, sigma, T_steps, K_paths, dt)

    # %% [code]
    # Compute the state variable X.
    # For example, one may take X as the log of S (common in QLBS literature).
    X = np.log(S)

    # %% [code]
    # Define N basis functions; here we use a simple polynomial basis (constant, linear, quadratic)
    def basis_function_1(x):
        """Constant feature: an array of ones shaped like ``x``."""
        return np.ones_like(x)

    def basis_function_2(x):
        """Linear feature: ``x`` itself."""
        return x

    def basis_function_3(x):
        """Quadratic feature: ``x`` squared."""
        return x**2

    basis_functions = [basis_function_1, basis_function_2, basis_function_3]
    N_basis = len(basis_functions)

    # Create the feature matrix phi with dimensions (T_steps+1, K_paths, N_basis)
    # NOTE: phi is built but not read by the placeholder recursion below.
    phi = np.zeros((T_steps + 1, K_paths, N_basis))
    for t in range(T_steps + 1):
        for n, func in enumerate(basis_functions):
            phi[t, :, n] = func(X[t])

    # %% [code]
    # Initialize arrays for the variables computed in the backward recursion
    # a_star, Pi, R_star, and Q_star each have shape (T_steps+1, K_paths)
    a_star = np.zeros((T_steps + 1, K_paths))
    Pi = np.zeros((T_steps + 1, K_paths))
    R_star = np.zeros((T_steps + 1, K_paths))
    Q_star = np.zeros((T_steps + 1, K_paths))

    # Terminal conditions at t = T_steps
    # Compute the option payoff: max(strike - S_T, 0)  (a put-style payoff)
    Pi[T_steps] = np.maximum(strike - S[T_steps], 0)
    # Center the terminal portfolio (subtracting the mean)
    # NOTE: Pi_hat_T is computed but not used further in this class body.
    Pi_hat_T = Pi[T_steps] - np.mean(Pi[T_steps])
    # Set terminal action to zero
    a_star[T_steps] = 0
    # Terminal risk measure (here using the variance of the payoff; note that var is a scalar)
    R_star[T_steps] = -lambda_param * np.var(Pi[T_steps])
    # Terminal Q-value (again, note that the λ·Var term is constant across paths)
    Q_star[T_steps] = -Pi[T_steps] - lambda_param * np.var(Pi[T_steps])

    # %% [code]
    # Backward recursion from t = T_steps-1 to t = 0
    for t in range(T_steps - 1, -1, -1):
        # === Compute a_star[t] as in (44) ===
        # Placeholder: in a full implementation, you would estimate a regression of the continuation value
        # on the features phi[t]. Here we set it to zero.
        a_star[t] = 0  # Replace with actual computation using phi[t]

        # === Compute Pi[t] as in (29) ===
        # Placeholder: here we mimic the terminal payoff but an actual update rule may be more complex.
        Pi[t] = np.maximum(strike - S[t], 0)  # Replace with your QLBS update rule

        # === Compute R_star[t] as in (41) ===
        # Placeholder: you might compute a risk measure update here.
        R_star[t] = R_star[t + 1]  # Replace with actual computation

        # === Compute Q_star[t] as in (45) ===
        # Placeholder: combine the immediate cost and risk measure.
        Q_star[t] = -Pi[t] - lambda_param * np.var(Pi[t])  # Replace with the proper formula

    # %% [code]
    # Calculate the QLBS option price at t = 0:
    # QLBS_price = - (1/K_paths) * sum_{k=1}^{K_paths} Q_star[0, k]
    # With the placeholders above the result is zero: S[0] equals S0 = strike on
    # every path, so Pi[0] and its variance are both 0.
    QLBS_price = -np.mean(Q_star[0])
    print("QLBS Option Price at t=0:", QLBS_price)


QLBS Option Price at t=0: -0.0


# New Version

In [5]:
import numpy as np

class stock():
    """Single-asset price simulator with a small Monte Carlo call pricer.

    Parameters
    ----------
    s0 : float
        Initial asset price.
    r : float
        Risk-free rate used as drift and for discounting.
    sigma : float
        Constant volatility for GBM; initial volatility for Heston.
    T : float
        Time horizon.
    n : int
        Number of points on a simulated path; the step size is ``T / n``.
    model : str
        ``'gbm'`` (default) or ``'heston'``.
    """

    def __init__(self, s0, r, sigma, T, n, model='gbm'):
        self.s0 = s0
        self.r = r
        self.T = T
        self.n = n
        self.dt = T / n
        self.model = model
        self.sigma = sigma  # For GBM, sigma is constant; for Heston, sigma is the initial volatility.
        # Heston parameters are only created when the Heston model is selected.
        if self.model == 'heston':
            self.kappa = 2.0         # speed of mean reversion
            self.theta = sigma**2    # long-run variance (theta)
            self.xi = 0.1            # volatility of volatility

    def vol(self, sigma):
        """Return a length-``n`` volatility path for the configured model.

        For ``'gbm'`` the path is constant at ``sigma``. For ``'heston'`` the
        variance starts at ``sigma**2``, is advanced with an Euler-Maruyama
        step, floored at zero, and its square root is returned.

        Raises
        ------
        ValueError
            If ``self.model`` is neither ``'gbm'`` nor ``'heston'`` (previously
            this returned ``None`` and failed later inside ``simulate``).
        """
        if self.model == 'gbm':
            return np.array([sigma] * self.n)
        if self.model == 'heston':
            # initialize variance process
            v = np.zeros(self.n)
            v[0] = sigma**2
            for i in range(1, self.n):
                # Euler-Maruyama update for variance
                dv = self.kappa * (self.theta - v[i-1]) * self.dt + self.xi * np.sqrt(max(v[i-1], 0)) * np.sqrt(self.dt) * np.random.normal()
                # Floor at zero so the square root below stays real.
                v[i] = max(v[i-1] + dv, 0)
            # Return the volatility (sqrt of variance)
            return np.sqrt(v)
        raise ValueError(f"Unknown model {self.model!r}; expected 'gbm' or 'heston'")

    def simulate(self):
        """Simulate one price path of length ``n`` starting at ``s0``.

        The volatility path is drawn first via ``vol`` and then combined with
        independent standard-normal innovations in a log-Euler step.
        """
        # no need to check model here, as vol() handles it
        vol_path = self.vol(self.sigma)
        innovations = np.random.normal(0, 1, self.n)
        stock_prices = np.zeros(self.n)
        stock_prices[0] = self.s0

        for i in range(1, self.n):
            stock_prices[i] = stock_prices[i-1] * np.exp(
                (self.r - 0.5 * vol_path[i]**2) * self.dt + vol_path[i] * np.sqrt(self.dt) * innovations[i]
            )
        return stock_prices

    def option_price(self, K, n_paths=1000):
        """Monte Carlo price of a European call with strike ``K``.

        Parameters
        ----------
        K : float
            Strike price.
        n_paths : int
            Number of independent paths to average over (default 1000). The
            previous implementation took the "mean" of a single path's payoff,
            which is one noisy draw rather than an expectation.

        Returns
        -------
        float
            Discounted mean payoff ``exp(-r*T) * mean(max(S_last - K, 0))``.

        Raises
        ------
        ValueError
            If ``n_paths`` is smaller than 1.
        """
        if n_paths < 1:
            raise ValueError("n_paths must be at least 1")
        terminal_prices = np.array([self.simulate()[-1] for _ in range(n_paths)])
        # European call option payoff at maturity
        payoff = np.maximum(terminal_prices - K, 0)
        return np.exp(-self.r * self.T) * np.mean(payoff)


# Example usage:
if __name__ == "__main__":
    # Using the Heston model
    # (250 path points over a 1-year horizon, initial volatility 20%, rate 5%).
    heston_stock = stock(s0=100, r=0.05, sigma=0.2, T=1.0, n=250, model='heston')
    # One simulated price path, shown below for inspection only.
    simulated_prices = heston_stock.simulate()
    # option_price() simulates its own path(s) internally, so the price printed
    # below is NOT derived from `simulated_prices`. No random seed is set in
    # this cell, so both outputs change from run to run.
    price = heston_stock.option_price(K=100)

    print("Simulated Stock Prices (Heston):")
    print(simulated_prices)
    print("\nOption Price (Heston):", price)


Simulated Stock Prices (Heston):
[100.          97.77855336  97.0877602   97.67846846  98.39851349
  98.40933976  98.54160439  98.21772185  98.01932248  96.80721411
  97.82238949  98.77386293  99.2083249   98.91912334  97.78240088
  98.69030889  99.10407195 100.05665888  99.93942357 101.22210555
 101.6959107  100.87876677 102.06812112 101.16878383 103.01358198
 103.63787722 101.76168824 100.93179638 101.47626939 100.19106042
  98.80214381  99.49271029 100.32509691 100.07353023  99.78374195
 100.68868727 100.37989557  99.45404241  97.85176198  96.03437564
  97.29843464  96.36157797  96.78790962  97.54963739  97.13237741
  97.6834944   97.4505998   98.04957137  96.37569084  96.51931495
  97.17080744  97.20214034 100.07574224  99.90159287  99.05192823
  99.18569936 100.89541339 101.08200815 100.31649377  99.27158892
  99.97726763 100.93913613 100.78704549 100.54582744 100.56533894
  99.66350235  99.11911625  98.09001765  97.31603319  94.44446949
  94.70459822  93.8704828   94.14549325  94

In [None]:
# Install a pip package in the current Jupyter kernel
import sys
!{sys.executable} -m pip install redis

# Connect to Redis
r = redis.Redis(host='localhost', port=6379, db=0)

# Set a value in Redis
r.set('Stock', 'NVDA', 'Price', '99.60')

# Get a value from Redis
stock = r.get('stock')
print(stock)


Collecting redis
  Downloading redis-5.2.1-py3-none-any.whl.metadata (9.1 kB)
Downloading redis-5.2.1-py3-none-any.whl (261 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/261.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━[0m [32m174.1/261.5 kB[0m [31m5.0 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m261.5/261.5 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: redis
Successfully installed redis-5.2.1


NameError: name 'redis' is not defined

In [4]:
import numpy as np
import gym
from gym import spaces
import torch
import torch.nn as nn
import torch.optim as optim
import torch.distributions as D
from collections import deque
import random
import matplotlib.pyplot as plt

##############################
# Environment Implementation #
##############################

class HedgingEnv(gym.Env):
    """
    Custom hedging environment.

    - Dynamics: the underlying is advanced with a driftless Euler step,
      S <- S * (1 + sigma * sqrt(dt) * Z1). If stochastic_vol==True, volatility
      is updated via a simplified SABR-like model with shocks correlated by rho.
    - Reward: at each timestep the reward is the rebalancing cash flow net of
      proportional transaction costs, minus a risk penalty proportional to the
      squared cash flow.
    - At t=0, the agent buys the initial hedge (reward = purchase cost).
    - At maturity, the position is liquidated and the payoff of a European call
      is subtracted; this terminal amount is ADDED to the reward of the last
      step so that the rebalancing cash flow of that step is not lost.

    Observation: [S, sigma, previous hedge, normalized time] as float32.
    Action: the hedge position to hold over the next step (shape (1,) or scalar).

    API note: ``reset`` returns only the observation and ``step`` returns the
    4-tuple ``(state, reward, done, info)``.
    """
    def __init__(self,
                 T=1.0,            # time horizon (e.g. 1 year)
                 n_steps=50,       # number of timesteps per episode
                 S0=100.0,         # initial asset price
                 sigma0=0.2,       # initial volatility
                 kappa=0.001,      # transaction cost parameter
                 risk_aversion=0.01,  # risk–penalty parameter lambda
                 strike=100.0,     # strike price of the option (European call)
                 nu=0.1,           # vol-of-vol (for SABR dynamics)
                 rho=-0.3,         # correlation between asset and volatility shocks
                 stochastic_vol=True):  # whether to use stochastic volatility dynamics
        super(HedgingEnv, self).__init__()
        self.T = T
        self.n_steps = n_steps
        self.dt = T / n_steps
        self.S0 = S0
        self.sigma0 = sigma0
        self.kappa = kappa
        self.risk_aversion = risk_aversion
        self.strike = strike
        self.nu = nu
        self.rho = rho
        self.stochastic_vol = stochastic_vol

        # Continuous action: hedge position. (We assume it can be any real number.)
        self.action_space = spaces.Box(low=-np.inf, high=np.inf, shape=(1,), dtype=np.float32)
        # State: we use [S, sigma, previous hedge, normalized time]
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(4,), dtype=np.float32)
        self.reset()

    def reset(self):
        """Start a new episode and return the initial observation."""
        self.t = 0
        self.S = self.S0
        self.sigma = self.sigma0
        self.a_prev = 0.0  # initial hedge (no position)
        self.state = np.array([self.S, self.sigma, self.a_prev, 0.0], dtype=np.float32)
        return self.state

    def step(self, action):
        """Apply hedge position ``action``, advance the market one step.

        Returns ``(state, reward, done, info)``; ``done`` becomes True once
        ``n_steps`` steps have been taken.
        """
        # Accept scalars as well as shape-(1,) arrays; squeezing first avoids
        # NumPy's deprecated implicit conversion of 1-d arrays to float.
        action = float(np.squeeze(action))
        done = False
        info = {}

        # Compute hedge adjustment cost and change in wealth
        trade = self.a_prev - action  # change in hedge
        delta_w = self.S * trade - self.kappa * abs(self.S * trade)
        risk_penalty = self.risk_aversion * (delta_w ** 2)
        reward = delta_w - risk_penalty

        if self.t == 0:
            # At initial step, buying replicating portfolio
            reward = - self.S * action - self.kappa * abs(self.S * action)

        self.t += 1

        # Update underlying dynamics:
        Z1 = np.random.normal()
        if self.stochastic_vol:
            # Stochastic volatility update using a simplified SABR-like model
            Z2 = np.random.normal()
            self.S = self.S * (1 + self.sigma * np.sqrt(self.dt) * Z1)
            self.sigma = self.sigma + self.nu * self.sigma * np.sqrt(self.dt) * (
                self.rho * Z1 + np.sqrt(1 - self.rho ** 2) * Z2)
            # Ensure sigma stays positive
            self.sigma = max(self.sigma, 1e-3)
        else:
            # Constant volatility dynamics
            self.S = self.S * (1 + self.sigma * np.sqrt(self.dt) * Z1)
            # sigma remains constant
            self.sigma = self.sigma0

        self.a_prev = action
        time_frac = self.t / self.n_steps
        if self.t >= self.n_steps:
            # Terminal reward: liquidate position and subtract option payoff.
            option_payoff = max(self.S - self.strike, 0)
            final_delta = self.S * action - self.kappa * abs(self.S * action) - option_payoff
            # Add to (rather than replace) this step's reward: the position being
            # liquidated was paid for by the cash flow computed above, so dropping
            # that cash flow would make the last rebalance free.
            reward += final_delta
            done = True

        self.state = np.array([self.S, self.sigma, self.a_prev, time_frac], dtype=np.float32)
        return self.state, reward, done, info

##################################
# Helper: Monte Carlo Simulation #
##################################

def simulate_paths(env, N=1000):
    """
    Roll out N episodes of ``env`` under a fixed "do nothing" policy.

    Every step is taken with action 0.0. For each episode the list of visited
    states (starting with the state returned by ``env.reset()``) and the sum of
    rewards are recorded.

    Returns a pair ``(paths, returns)`` of lists with one entry per episode.
    """
    hold_action = 0.0  # fixed policy: never hedge
    all_trajectories, all_totals = [], []
    for _episode in range(N):
        obs = env.reset()
        visited = [obs]
        cumulative = 0.0
        finished = False
        while not finished:
            obs, step_reward, finished, _info = env.step(hold_action)
            cumulative += step_reward
            visited.append(obs)
        all_trajectories.append(visited)
        all_totals.append(cumulative)
    return all_trajectories, all_totals

#####################################
# Tabular Q-Learning Implementation #
#####################################

class TabularQAgent:
    def __init__(self, env, price_bins=50, action_low=-1.0, action_high=1.0, action_step=0.01,
                 alpha=0.1, gamma=0.99, epsilon=1.0, epsilon_decay=0.995, min_epsilon=0.01):
        self.env = env
        self.price_bins = np.linspace(0, 2 * env.S0, price_bins)
        self.n_time = env.n_steps + 1  # include terminal step
        self.actions = np.arange(action_low, action_high + action_step, action_step)
        self.n_actions = len(self.actions)
        # Q-table indexed by (price_index, time_index)
        self.Q = np.zeros((price_bins, self.n_time, self.n_actions))
        # Hyperparameters
        self.alpha = alpha
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_decay = epsilon_decay
        self.min_epsilon = min_epsilon

    def discretize_state(self, state):
        # For tabular Q-learning we use only the price S and discrete time.
        S, _, _, t_frac = state
        price_idx = np.digitize(S, self.price_bins) - 1
        price_idx = np.clip(price_idx, 0, len(self.price_bins)-1)
        time_idx = int(t_frac * self.env.n_steps)
        time_idx = np.clip(time_idx, 0, self.n_time-1)
        return price_idx, time_idx

    def choose_action(self, state):
        price_idx, time_idx = self.discretize_state(state)
        if np.random.rand() < self.epsilon:
            # Explore: choose a random discrete action index.
            a_idx = np.random.randint(self.n_actions)
        else:
            a_idx = np.argmax(self.Q[price_idx, time_idx])
        return self.actions[a_idx], a_idx

    def update(self, state, action_idx, reward, next_state, done):
        s_idx = self.discretize_state(state)
        next_s_idx = self.discretize_state(next_state)
        price_idx, time_idx = s_idx
        n_price_idx, n_time_idx = next_s_idx

        best_next = 0 if done else np.max(self.Q[n_price_idx, n_time_idx])
        td_target = reward + self.gamma * best_next
        td_error = td_target - self.Q[price_idx, time_idx, action_idx]
        self.Q[price_idx, time_idx, action_idx] += self.alpha * td_error

    def decay_epsilon(self):
        self.epsilon = max(self.min_epsilon, self.epsilon * self.epsilon_decay)

def train_tabular_q_learning(env, n_episodes=1000):
    """Train a fresh ``TabularQAgent`` on ``env`` and return per-episode total rewards.

    Each episode runs until the environment reports ``done``; the agent is
    updated after every transition and epsilon is decayed once per episode.
    Progress is printed every 50 episodes.
    """
    agent = TabularQAgent(env)
    rewards_per_episode = []
    for ep in range(n_episodes):
        state, total_reward = env.reset(), 0.0
        while True:
            action, action_idx = agent.choose_action(state)
            next_state, reward, done, _info = env.step(action)
            agent.update(state, action_idx, reward, next_state, done)
            total_reward += reward
            state = next_state
            if done:
                break
        agent.decay_epsilon()
        rewards_per_episode.append(total_reward)
        if not ep % 50:
            print(f"[Tabular Q] Episode {ep}, Total Reward: {total_reward:.2f}, Epsilon: {agent.epsilon:.3f}")
    return rewards_per_episode

###################################
# DDPG Implementation (Lillicrap 2019) #
###################################

# Define Actor and Critic networks for DDPG
class DDPGActor(nn.Module):
    """Deterministic policy network: state -> action.

    Two ReLU hidden layers of width ``hidden_size`` followed by an unbounded
    linear output of size ``action_dim``.
    """

    def __init__(self, state_dim, action_dim, hidden_size=64):
        super().__init__()
        # Creation order fc1 -> fc2 -> out is kept so seeded initialisation is unchanged.
        self.fc1 = nn.Linear(state_dim, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.out = nn.Linear(hidden_size, action_dim)

    def forward(self, state):
        hidden = state
        for layer in (self.fc1, self.fc2):
            hidden = nn.functional.relu(layer(hidden))
        return self.out(hidden)

class DDPGCritic(nn.Module):
    """Action-value network: (state, action) -> scalar Q estimate.

    State and action are concatenated along dim 1 and passed through two ReLU
    hidden layers of width ``hidden_size`` and a linear output of size 1.
    """

    def __init__(self, state_dim, action_dim, hidden_size=64):
        super().__init__()
        # Creation order fc1 -> fc2 -> out is kept so seeded initialisation is unchanged.
        self.fc1 = nn.Linear(state_dim + action_dim, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.out = nn.Linear(hidden_size, 1)

    def forward(self, state, action):
        hidden = torch.cat([state, action], dim=1)
        for layer in (self.fc1, self.fc2):
            hidden = nn.functional.relu(layer(hidden))
        return self.out(hidden)

class ReplayBuffer:
    """Fixed-capacity FIFO store of ``(state, action, reward, next_state, done)`` tuples.

    Once ``capacity`` transitions are stored, pushing a new one drops the oldest.
    """

    def __init__(self, capacity=100000):
        self.buffer = deque(maxlen=capacity)

    def push(self, state, action, reward, next_state, done):
        """Append one transition."""
        transition = (state, action, reward, next_state, done)
        self.buffer.append(transition)

    def sample(self, batch_size):
        """Draw ``batch_size`` distinct transitions uniformly at random.

        Returns five numpy arrays (states, actions, rewards, next_states,
        dones), each stacked along the first axis.
        """
        drawn = random.sample(self.buffer, batch_size)
        columns = [np.array(column) for column in zip(*drawn)]
        states, actions, rewards, next_states, dones = columns
        return states, actions, rewards, next_states, dones

    def __len__(self):
        return len(self.buffer)

def train_ddpg(env, n_episodes=500, batch_size=64, gamma=0.99, tau=0.005):
    """Train a DDPG agent on ``env`` and return the list of per-episode total rewards.

    Exploration adds N(0, 0.1) noise to the actor output. After every
    environment step, once the replay buffer holds more than ``batch_size``
    transitions, one critic update, one actor update and one soft (Polyak)
    target update with rate ``tau`` are performed. Progress is printed every
    20 episodes.
    """
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]

    # Networks are instantiated in the order actor, critic, target actor,
    # target critic; the targets start as exact copies of the online networks.
    actor = DDPGActor(state_dim, action_dim)
    critic = DDPGCritic(state_dim, action_dim)
    target_actor = DDPGActor(state_dim, action_dim)
    target_critic = DDPGCritic(state_dim, action_dim)
    target_actor.load_state_dict(actor.state_dict())
    target_critic.load_state_dict(critic.state_dict())

    actor_optimizer = optim.Adam(actor.parameters(), lr=1e-3)
    critic_optimizer = optim.Adam(critic.parameters(), lr=1e-3)
    mse = nn.MSELoss()

    def soft_update(target_net, online_net):
        # target <- tau * online + (1 - tau) * target, parameter by parameter
        for target_param, param in zip(target_net.parameters(), online_net.parameters()):
            target_param.data.copy_(tau*param.data + (1-tau)*target_param.data)

    buffer = ReplayBuffer()
    rewards_per_episode = []

    for episode in range(n_episodes):
        state = env.reset()
        total_reward = 0.0
        done = False
        while not done:
            # Act: deterministic policy output plus Gaussian exploration noise.
            policy_out = actor(torch.FloatTensor(state).unsqueeze(0)).detach().numpy()[0]
            action = policy_out + np.random.normal(0, 0.1, size=action_dim)
            next_state, reward, done, _ = env.step(action[0])
            buffer.push(state, action, reward, next_state, done)
            total_reward += reward
            state = next_state

            # Learn only once the buffer holds more than one batch.
            if len(buffer) <= batch_size:
                continue

            states, actions, rewards, next_states, dones = buffer.sample(batch_size)
            states_tensor = torch.FloatTensor(states)
            actions_tensor = torch.FloatTensor(actions)
            rewards_tensor = torch.FloatTensor(rewards).unsqueeze(1)
            next_states_tensor = torch.FloatTensor(next_states)
            dones_tensor = torch.FloatTensor(dones).unsqueeze(1)

            # Critic: regress Q(s, a) onto the one-step bootstrapped target.
            bootstrap_q = target_critic(next_states_tensor, target_actor(next_states_tensor))
            expected_q = (rewards_tensor + gamma * (1 - dones_tensor) * bootstrap_q).detach()
            critic_loss = mse(critic(states_tensor, actions_tensor), expected_q)
            critic_optimizer.zero_grad()
            critic_loss.backward()
            critic_optimizer.step()

            # Actor: ascend the critic's value of the actor's own actions.
            actor_loss = -critic(states_tensor, actor(states_tensor)).mean()
            actor_optimizer.zero_grad()
            actor_loss.backward()
            actor_optimizer.step()

            # Targets slowly track the online networks.
            soft_update(target_actor, actor)
            soft_update(target_critic, critic)

        rewards_per_episode.append(total_reward)
        if episode % 20 == 0:
            print(f"[DDPG] Episode {episode}, Total Reward: {total_reward:.2f}")
    return rewards_per_episode

###########################################
# PPO Implementation (Schulman et al. 2017) #
###########################################

class PPOActor(nn.Module):
    """Gaussian policy network: state -> (mean, std) of the action distribution.

    The mean comes from two tanh hidden layers and a linear head; the standard
    deviation is state-independent, stored as a learnable ``log_std`` vector
    (initialised to zeros, i.e. std = 1).
    """

    def __init__(self, state_dim, action_dim, hidden_size=64):
        super().__init__()
        # Creation order fc1 -> fc2 -> mean is kept so seeded initialisation is unchanged.
        self.fc1 = nn.Linear(state_dim, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.mean = nn.Linear(hidden_size, action_dim)
        # Log of the exploration std; exponentiated in forward() to stay positive.
        self.log_std = nn.Parameter(torch.zeros(action_dim))

    def forward(self, state):
        hidden = state
        for layer in (self.fc1, self.fc2):
            hidden = torch.tanh(layer(hidden))
        return self.mean(hidden), torch.exp(self.log_std)

class PPOCritic(nn.Module):
    """State-value network: state -> scalar value estimate.

    Two tanh hidden layers of width ``hidden_size`` followed by a linear output
    of size 1.
    """

    def __init__(self, state_dim, hidden_size=64):
        super().__init__()
        # Creation order fc1 -> fc2 -> out is kept so seeded initialisation is unchanged.
        self.fc1 = nn.Linear(state_dim, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.out = nn.Linear(hidden_size, 1)

    def forward(self, state):
        hidden = state
        for layer in (self.fc1, self.fc2):
            hidden = torch.tanh(layer(hidden))
        return self.out(hidden)

def train_ppo(env, n_episodes=500, clip_epsilon=0.2, gamma=0.99, lr=3e-4, update_epochs=5):
    """Train a clipped-objective PPO agent on ``env``; return per-episode total rewards.

    One episode is collected per iteration with the current Gaussian policy.
    Discounted rewards-to-go are normalised and used both as critic targets and
    (minus the critic's value) as advantages. The actor and critic are then
    updated jointly for ``update_epochs`` passes over that single trajectory.
    Progress is printed every 20 episodes.

    Parameters
    ----------
    env : environment with ``reset()`` and 4-tuple ``step()``; scalar actions.
    n_episodes : number of episodes to collect and train on.
    clip_epsilon : PPO ratio clipping range.
    gamma : discount factor.
    lr : Adam learning rate shared by actor and critic.
    update_epochs : gradient steps per collected trajectory.
    """
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]

    actor = PPOActor(state_dim, action_dim)
    critic = PPOCritic(state_dim)
    optimizer = optim.Adam(list(actor.parameters()) + list(critic.parameters()), lr=lr)

    rewards_per_episode = []

    for episode in range(n_episodes):
        state = env.reset()
        done = False
        trajectory = []
        total_reward = 0.0
        while not done:
            state_tensor = torch.FloatTensor(state).unsqueeze(0)
            mean, std = actor(state_tensor)
            dist = D.Normal(mean, std)
            action = dist.sample()
            log_prob = dist.log_prob(action)
            next_state, reward, done, _ = env.step(action.item())
            trajectory.append((state, action.item(), reward, log_prob.item()))
            total_reward += reward
            state = next_state

        # Compute rewards-to-go
        returns = []
        G = 0
        for (_, _, r, _) in reversed(trajectory):
            G = r + gamma * G
            returns.insert(0, G)
        returns = torch.FloatTensor(returns)
        # Normalize returns. The sample std of a single value is NaN, so a
        # one-step trajectory is left unnormalised instead of poisoning the update.
        if returns.numel() > 1:
            returns = (returns - returns.mean()) / (returns.std() + 1e-8)

        # Convert trajectory to tensors; stacking into one ndarray first avoids
        # the slow list-of-arrays conversion path.
        states = torch.FloatTensor(np.array([s for (s, a, r, lp) in trajectory]))
        actions = torch.FloatTensor([[a] for (s, a, r, lp) in trajectory])
        old_log_probs = torch.FloatTensor([[lp] for (s, a, r, lp) in trajectory])
        targets = returns.unsqueeze(1)

        # Multiple update epochs on the same trajectory
        for _ in range(update_epochs):
            mean, std = actor(states)
            dist = D.Normal(mean, std)
            # log_prob already has shape (T, 1), matching old_log_probs. The
            # previous extra unsqueeze made it (T, 1, 1), which broadcast the
            # ratio to (T, T, 1) and mixed unrelated timesteps in the loss.
            new_log_probs = dist.log_prob(actions)
            ratio = torch.exp(new_log_probs - old_log_probs)

            values = critic(states)
            advantages = targets - values.detach()
            # PPO clipped objective
            surr1 = ratio * advantages
            surr2 = torch.clamp(ratio, 1 - clip_epsilon, 1 + clip_epsilon) * advantages
            actor_loss = -torch.min(surr1, surr2).mean()
            critic_loss = nn.MSELoss()(values, targets)

            loss = actor_loss + 0.5 * critic_loss

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        rewards_per_episode.append(total_reward)
        if episode % 20 == 0:
            print(f"[PPO] Episode {episode}, Total Reward: {total_reward:.2f}")
    return rewards_per_episode

#############################################
# GRPO Implementation (DeepSeek 2024 variant)#
#############################################

# NOTE: the published GRPO (Group Relative Policy Optimization, DeepSeek 2024) drops the critic and uses
# group-relative advantages. For this demonstration we instead assume a simpler variant: the PPO update
# used above, augmented with an additional regularization term meant to penalize large actor gradients.
def train_grpo(env, n_episodes=500, clip_epsilon=0.2, gamma=0.99, lr=3e-4,
               update_epochs=5, grad_reg_coeff=0.1):
    """Train PPO with an added actor-gradient-norm penalty; return per-episode total rewards.

    The data collection, returns and clipped surrogate are the same as in a
    single-trajectory PPO update. In addition, the sum of the L2 norms of the
    actor-loss gradient (one norm per actor parameter tensor) is added to the
    loss, scaled by ``grad_reg_coeff``.

    The penalty is built with ``torch.autograd.grad(..., create_graph=True)``
    so that it is itself differentiable and actually influences the optimiser
    step. The previous implementation added the norm to ``loss`` after
    ``backward()`` had already run, which left ``grad_reg_coeff`` without effect.

    Parameters
    ----------
    env : environment with ``reset()`` and 4-tuple ``step()``; scalar actions.
    n_episodes : number of episodes to collect and train on.
    clip_epsilon : PPO ratio clipping range.
    gamma : discount factor.
    lr : Adam learning rate shared by actor and critic.
    update_epochs : gradient steps per collected trajectory.
    grad_reg_coeff : weight of the gradient-norm penalty (0 disables it).
    """
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]

    actor = PPOActor(state_dim, action_dim)
    critic = PPOCritic(state_dim)
    actor_params = list(actor.parameters())
    optimizer = optim.Adam(actor_params + list(critic.parameters()), lr=lr)

    rewards_per_episode = []

    for episode in range(n_episodes):
        state = env.reset()
        done = False
        trajectory = []
        total_reward = 0.0
        while not done:
            state_tensor = torch.FloatTensor(state).unsqueeze(0)
            mean, std = actor(state_tensor)
            dist = D.Normal(mean, std)
            action = dist.sample()
            log_prob = dist.log_prob(action)
            next_state, reward, done, _ = env.step(action.item())
            trajectory.append((state, action.item(), reward, log_prob.item()))
            total_reward += reward
            state = next_state

        # Discounted rewards-to-go, normalised when there is more than one step
        # (the sample std of a single value is NaN).
        returns = []
        G = 0
        for (_, _, r, _) in reversed(trajectory):
            G = r + gamma * G
            returns.insert(0, G)
        returns = torch.FloatTensor(returns)
        if returns.numel() > 1:
            returns = (returns - returns.mean()) / (returns.std() + 1e-8)

        states = torch.FloatTensor(np.array([s for (s, a, r, lp) in trajectory]))
        actions = torch.FloatTensor([[a] for (s, a, r, lp) in trajectory])
        old_log_probs = torch.FloatTensor([[lp] for (s, a, r, lp) in trajectory])
        targets = returns.unsqueeze(1)

        for _ in range(update_epochs):
            mean, std = actor(states)
            dist = D.Normal(mean, std)
            # log_prob already has shape (T, 1), matching old_log_probs; an extra
            # unsqueeze here would broadcast the ratio to (T, T, 1).
            new_log_probs = dist.log_prob(actions)
            ratio = torch.exp(new_log_probs - old_log_probs)

            values = critic(states)
            advantages = targets - values.detach()
            surr1 = ratio * advantages
            surr2 = torch.clamp(ratio, 1 - clip_epsilon, 1 + clip_epsilon) * advantages
            actor_loss = -torch.min(surr1, surr2).mean()
            critic_loss = nn.MSELoss()(values, targets)

            loss = actor_loss + 0.5 * critic_loss

            # Additional gradient regularization (a penalty on the norm of the actor gradient)
            if grad_reg_coeff:
                actor_grads = torch.autograd.grad(
                    actor_loss, actor_params, create_graph=True, allow_unused=True)
                # The small constant keeps the square root differentiable when a
                # gradient is exactly zero (e.g. every ratio is clipped).
                grad_norm = sum(
                    torch.sqrt(g.pow(2).sum() + 1e-12)
                    for g in actor_grads if g is not None)
                loss = loss + grad_reg_coeff * grad_norm

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        rewards_per_episode.append(total_reward)
        if episode % 20 == 0:
            print(f"[GRPO] Episode {episode}, Total Reward: {total_reward:.2f}")
    return rewards_per_episode

####################################
# Main comparison and experiment   #
####################################

def _run_volatility_comparison(stochastic_vol, regime, n_episodes, leading_newline):
    """Train all four algorithms in one volatility regime and plot their reward curves.

    A fresh ``HedgingEnv`` is created for every algorithm. ``regime`` is the
    word shown in banners and the plot title ("Constant" or "Stochastic");
    ``leading_newline`` controls whether the first banner starts with a blank line.
    Returns a dict mapping algorithm label to its per-episode rewards.
    """
    trainers = (
        ("Tabular Q-Learning", train_tabular_q_learning),
        ("DDPG", train_ddpg),
        ("PPO", train_ppo),
        ("GRPO", train_grpo),
    )
    curves = {}
    for position, (label, trainer) in enumerate(trainers):
        lead = "\n" if (leading_newline or position > 0) else ""
        print(f"{lead}======== Running {label} ({regime} Volatility) ========")
        env = HedgingEnv(stochastic_vol=stochastic_vol)
        curves[label] = trainer(env, n_episodes=n_episodes)

    # Plot convergence curves for this regime
    plt.figure(figsize=(10,6))
    for label, rewards in curves.items():
        plt.plot(rewards, label=label)
    plt.xlabel("Episode")
    plt.ylabel("Total Reward")
    plt.title(f"Performance Comparison ({regime} Volatility)")
    plt.legend()
    plt.show()
    return curves


def main(seed=None):
    """Run the algorithm comparison under constant and then stochastic volatility.

    Parameters
    ----------
    seed : int, optional
        When given, seeds NumPy, ``random`` and PyTorch so a run can be
        reproduced. The default (None) leaves the generators untouched.
    """
    if seed is not None:
        np.random.seed(seed)
        random.seed(seed)
        torch.manual_seed(seed)

    # Number of Monte Carlo paths is given by the number of episodes we simulate.
    n_mc = 500  # for demonstration we use 500 episodes per algorithm

    _run_volatility_comparison(False, "Constant", n_mc, leading_newline=False)
    # Now repeat with stochastic volatility dynamics.
    _run_volatility_comparison(True, "Stochastic", n_mc, leading_newline=True)

if __name__ == "__main__":
    main()


[Tabular Q] Episode 0, Total Reward: -2260.85, Epsilon: 0.995
[Tabular Q] Episode 50, Total Reward: -2682.09, Epsilon: 0.774
[Tabular Q] Episode 100, Total Reward: -5369.04, Epsilon: 0.603
[Tabular Q] Episode 150, Total Reward: -3221.20, Epsilon: 0.469
[Tabular Q] Episode 200, Total Reward: -2881.90, Epsilon: 0.365
[Tabular Q] Episode 250, Total Reward: -4953.80, Epsilon: 0.284
[Tabular Q] Episode 300, Total Reward: -3136.11, Epsilon: 0.221
[Tabular Q] Episode 350, Total Reward: -3065.78, Epsilon: 0.172
[Tabular Q] Episode 400, Total Reward: -1207.62, Epsilon: 0.134
[Tabular Q] Episode 450, Total Reward: -719.54, Epsilon: 0.104
[Tabular Q] Episode 500, Total Reward: -754.68, Epsilon: 0.081
[Tabular Q] Episode 550, Total Reward: -1046.49, Epsilon: 0.063
[Tabular Q] Episode 600, Total Reward: -849.06, Epsilon: 0.049
[Tabular Q] Episode 650, Total Reward: -1896.88, Epsilon: 0.038
[Tabular Q] Episode 700, Total Reward: -253.64, Epsilon: 0.030
[Tabular Q] Episode 750, Total Reward: -33.20, 

KeyboardInterrupt: 