## Reinforcement Learning Optimizer

In [None]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from collections import deque
import random

# --- Environment ---
class EnergyTradingEnv:
    """Episodic trading environment driven by a scalar trading rate.

    The observation is the float32 array ``[Y, Z, t * dt]`` where ``Y`` is the
    mid price, ``Z`` the net position and ``t`` the number of steps taken so
    far. Both ``Y`` and ``Z`` receive Gaussian shocks drawn from NumPy's
    global random state, so seed ``np.random`` for reproducible episodes.

    Args:
        T: Episode horizon.
        dt: Step size; an episode lasts ``floor(T / dt)`` steps.
    """

    def __init__(self, T=1.0, dt=0.05):
        self.T = T
        self.dt = dt
        # Floor with a small tolerance: a bare int(T / dt) silently drops a
        # whole step when the quotient lands just below an integer
        # (e.g. 0.3 / 0.1 == 2.9999999999999996).
        self.steps = int(np.floor(T / dt + 1e-9))
        self.reset()

    def reset(self):
        """Start a new episode and return the initial observation."""
        self.t = 0
        self.Y = 50.0  # mid price
        self.Z = 0.0   # net position
        self.state = np.array([self.Y, self.Z, self.t * self.dt], dtype=np.float32)
        return self.state

    def step(self, q):
        """Apply trading rate ``q`` for one step.

        Args:
            q: Scalar trading rate for this step.

        Returns:
            Tuple ``(next_state, reward, done, info)``. ``reward`` is
            ``-q * P`` for the execution price ``P``; on the final step the
            negated ``terminal_cost`` is added. ``info`` is always ``{}``.
        """
        h = 0.1
        # NOTE(review): varphi is even in q, so it raises the execution price
        # for negative q as well as positive q — confirm this is intended.
        varphi = 0.01 * q ** 2
        psi = 0.05 * q
        sigma_Y = 1.0
        sigma_D = 1.0

        P = self.Y + np.sign(q) * h + varphi
        reward = -q * P

        # Draw order (price shock first, then position shock) is kept fixed so
        # that seeded runs reproduce the same trajectories.
        sqrt_dt = np.sqrt(self.dt)
        dW_Y = np.random.randn() * sqrt_dt
        dW_D = np.random.randn() * sqrt_dt

        self.Y += (0.0 + psi) * self.dt + sigma_Y * dW_Y
        self.Z += q * self.dt + sigma_D * dW_D

        self.t += 1
        done = self.t >= self.steps

        next_state = np.array([self.Y, self.Z, self.t * self.dt], dtype=np.float32)

        if done:
            reward += -self.terminal_cost(self.Y, self.Z)

        return next_state, reward, done, {}

    def terminal_cost(self, Y, Z):
        """Return the minimum of the terminal objective over a grid of ``xi``.

        For each ``xi`` on a 100-point grid over ``[0, 10]`` the objective is
        ``xi**2 - I_T * (Y + sign(I_T) * 0.1 + 0.1 * I_T)`` with
        ``I_T = xi + Z``.

        Args:
            Y: Mid price at the end of the episode.
            Z: Net position at the end of the episode.

        Returns:
            The smallest objective value found on the grid.
        """
        # Evaluate the whole grid in one vectorized pass instead of a Python
        # loop over the 100 candidates.
        xi = np.linspace(0, 10, 100)
        I_T = xi + Z
        C = xi ** 2
        imbalance_cost = I_T * (Y + np.sign(I_T) * 0.1 + 0.1 * I_T)
        return np.min(C - imbalance_cost)


# --- Actor and Critic Networks ---
class Actor(nn.Module):
    """Deterministic policy network mapping a state to a bounded action.

    Two hidden ReLU layers of width 64 feed a ``tanh`` output that is scaled
    by ``max_action``, so every action component lies in
    ``[-max_action, max_action]``.

    Args:
        state_dim: Size of the state vector.
        action_dim: Size of the action vector.
        max_action: Scale applied to the ``tanh`` output. Defaults to 5.
    """

    def __init__(self, state_dim, action_dim, max_action=5.0):
        super().__init__()
        # Stored as a plain attribute (not a parameter or buffer) so the
        # state_dict layout is unaffected.
        self.max_action = max_action
        self.net = nn.Sequential(
            nn.Linear(state_dim, 64),
            nn.ReLU(),
            nn.Linear(64, 64),
            nn.ReLU(),
            nn.Linear(64, action_dim),
            nn.Tanh()  # Output in [-1, 1]
        )

    def forward(self, x):
        """Return the action for state batch ``x``, scaled to the trading rate range."""
        return self.net(x) * self.max_action


class Critic(nn.Module):
    """Action-value network: scores a (state, action) pair with one scalar.

    The state and action are concatenated along the last dimension and passed
    through two hidden ReLU layers of width 64 to a single linear output.

    Args:
        state_dim: Size of the state vector.
        action_dim: Size of the action vector.
    """

    def __init__(self, state_dim, action_dim):
        super().__init__()
        hidden = 64
        layers = [
            nn.Linear(state_dim + action_dim, hidden),
            nn.ReLU(),
            nn.Linear(hidden, hidden),
            nn.ReLU(),
            nn.Linear(hidden, 1),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x, a):
        """Return the value estimate for states ``x`` paired with actions ``a``."""
        state_action = torch.cat((x, a), dim=-1)
        return self.net(state_action)


# --- DDPG Agent ---
class DDPG:
    """DDPG agent: actor/critic pair, target copies and a replay buffer.

    Transitions are stored with ``remember`` as
    ``(state, action, reward, next_state, done)`` and sampled uniformly in
    ``update``. Target networks start as copies of the online networks and
    track them through a soft update with rate ``tau``.

    Args:
        state_dim: Size of the state vector.
        action_dim: Size of the action vector.
    """

    def __init__(self, state_dim, action_dim):
        self.actor = Actor(state_dim, action_dim)
        self.actor_target = Actor(state_dim, action_dim)
        self.critic = Critic(state_dim, action_dim)
        self.critic_target = Critic(state_dim, action_dim)

        self.actor_target.load_state_dict(self.actor.state_dict())
        self.critic_target.load_state_dict(self.critic.state_dict())

        self.actor_optim = optim.Adam(self.actor.parameters(), lr=1e-3)
        self.critic_optim = optim.Adam(self.critic.parameters(), lr=1e-3)

        self.memory = deque(maxlen=100000)
        self.batch_size = 64
        self.gamma = 0.99
        self.tau = 0.005

    def act(self, state, noise=0.1):
        """Return the actor's action for ``state`` plus Gaussian exploration noise.

        Args:
            state: A single (unbatched) state vector.
            noise: Standard deviation of the added noise; ``0`` gives the
                deterministic action.

        Returns:
            NumPy array with one entry per action dimension. The noise is
            added without clipping, so the result may leave the actor's
            output range.
        """
        # No gradient is needed to pick an action, so skip building the graph.
        with torch.no_grad():
            obs = torch.as_tensor(state, dtype=torch.float32).unsqueeze(0)
            action = self.actor(obs).numpy()[0]
        return action + noise * np.random.randn(*action.shape)

    def remember(self, *args):
        """Append one transition tuple to the replay buffer."""
        self.memory.append(args)

    @staticmethod
    def _to_tensor(items):
        """Stack per-transition values into a single float32 tensor."""
        # Stacking with NumPy first avoids torch's slow element-by-element
        # conversion of a sequence of ndarrays (and the UserWarning it emits).
        return torch.as_tensor(np.asarray(items, dtype=np.float32))

    @staticmethod
    def _soft_update(target_net, source_net, tau):
        """Move ``target_net`` parameters a fraction ``tau`` toward ``source_net``."""
        with torch.no_grad():
            for target_param, param in zip(target_net.parameters(), source_net.parameters()):
                target_param.copy_(tau * param + (1 - tau) * target_param)

    def update(self):
        """Run one critic step, one actor step and a soft target update.

        Does nothing until the buffer holds at least ``batch_size``
        transitions.
        """
        if len(self.memory) < self.batch_size:
            return

        batch = random.sample(self.memory, self.batch_size)
        state, action, reward, next_state, done = zip(*batch)

        state = self._to_tensor(state)
        action = self._to_tensor(action)
        # Force column vectors so reward/done broadcast against the critic's
        # (batch, 1) output rather than across the batch.
        reward = self._to_tensor(reward).reshape(-1, 1)
        next_state = self._to_tensor(next_state)
        done = self._to_tensor(done).reshape(-1, 1)

        # Critic update: regress Q(s, a) onto the one-step bootstrapped target
        # computed from the target networks.
        with torch.no_grad():
            target_q = self.critic_target(next_state, self.actor_target(next_state))
            target = reward + self.gamma * (1 - done) * target_q

        current_q = self.critic(state, action)
        critic_loss = nn.functional.mse_loss(current_q, target)

        self.critic_optim.zero_grad()
        critic_loss.backward()
        self.critic_optim.step()

        # Actor update: ascend the critic's value of the actor's own actions.
        actor_loss = -self.critic(state, self.actor(state)).mean()

        self.actor_optim.zero_grad()
        actor_loss.backward()
        self.actor_optim.step()

        # Soft update
        self._soft_update(self.critic_target, self.critic, self.tau)
        self._soft_update(self.actor_target, self.actor, self.tau)


# --- Training Loop ---
if __name__ == '__main__':
    # Train a DDPG agent on the trading environment, updating after every
    # environment step and logging the episode return every 25th episode.
    env = EnergyTradingEnv()
    agent = DDPG(state_dim=3, action_dim=1)

    episodes = 500
    for ep in range(episodes):
        total_reward = 0
        state = env.reset()

        while True:
            action = agent.act(state)
            # The environment takes the scalar rate; the buffer keeps the array.
            next_state, reward, done, _ = env.step(action[0])
            agent.remember(state, action, reward, next_state, float(done))
            agent.update()
            total_reward += reward
            state = next_state
            if done:
                break

        if not ep % 25:
            print(f"Episode {ep}, Total Reward: {total_reward:.2f}")

Episode 0, Total Reward: 5104.03


  state = torch.FloatTensor(state)


Episode 25, Total Reward: 5211.10
Episode 50, Total Reward: 5173.54
Episode 75, Total Reward: 5079.84
Episode 100, Total Reward: 5163.22
Episode 125, Total Reward: 5071.87
Episode 150, Total Reward: 5094.73
Episode 175, Total Reward: 5224.25
Episode 200, Total Reward: 5082.18
Episode 225, Total Reward: 5140.54
Episode 250, Total Reward: 5335.52
Episode 275, Total Reward: 5204.58
Episode 300, Total Reward: 5162.30
Episode 325, Total Reward: 5100.10
Episode 350, Total Reward: 5026.66
Episode 375, Total Reward: 5048.24
Episode 400, Total Reward: 5116.40
Episode 425, Total Reward: 5218.49
Episode 450, Total Reward: 5190.51
Episode 475, Total Reward: 5124.24
