In [None]:
import gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions import Categorical

# Define the policy network
class PolicyNetwork(nn.Module):
    """Three-layer fully connected policy that outputs action probabilities.

    Args:
        num_features: Width of the input layer (size of one observation).
        num_actions: Width of the output layer (number of actions).
    """

    def __init__(self, num_features, num_actions):
        super().__init__()
        # Layer widths: num_features -> 128 -> 64 -> num_actions.
        self.fc1 = nn.Linear(num_features, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, num_actions)

    def forward(self, x):
        """Return a softmax distribution over the last dimension of the output."""
        hidden = self.fc1(x).relu()
        hidden = self.fc2(hidden).relu()
        logits = self.fc3(hidden)
        return logits.softmax(dim=-1)

# Constants
# Both values below are `...` placeholders. They are passed to nn.Linear as
# layer sizes by PolicyNetwork, so replace them with integers before running.
num_features = ...  # Define the size of the observation space
num_actions = ...  # Define the total number of actions

# Create the policy network and the Adam optimizer that updates its parameters
policy = PolicyNetwork(num_features, num_actions)
optimizer = optim.Adam(policy.parameters(), lr=0.01)

# Function to compute discounted rewards
def discount_rewards(rewards, gamma):
    """Compute the discounted return for every step of one episode.

    Uses the recursion G_t = r_t + gamma * G_{t+1}, evaluated from the last
    step backwards.

    Args:
        rewards: Iterable of per-step rewards in time order.
        gamma: Discount factor applied to future rewards.

    Returns:
        list[float]: Discounted returns, one per input reward. An empty
        input yields an empty list.
    """
    rewards = list(rewards)
    discounted = [0.0] * len(rewards)
    running_return = 0.0
    # Walk backwards so each step can reuse the return of the step after it.
    for t in reversed(range(len(rewards))):
        running_return = float(rewards[t]) + gamma * running_return
        discounted[t] = running_return
    return discounted

# Training function
def train_policy_gradient(policy, optimizer, states, actions, rewards):
    """Run one policy-gradient update from a single episode.

    The loss is the sum over steps of -log pi(a_t | s_t) * G_t, where G_t is
    the discounted return produced by discount_rewards using the module-level
    `gamma`.

    Args:
        policy: Module mapping a state tensor to action probabilities.
        optimizer: Optimizer over the policy's parameters.
        states: Per-step observations (array-likes or tensors).
        actions: Per-step sampled actions (tensors or plain ints).
        rewards: Per-step rewards.

    Returns:
        None. Does nothing when the episode contains no steps.
    """
    discounted_rewards = discount_rewards(rewards, gamma)

    policy_loss = []
    for state, action, reward in zip(states, actions, discounted_rewards):
        state = torch.as_tensor(state, dtype=torch.float32)
        # log_prob needs a tensor; this also accepts actions stored as ints.
        action = torch.as_tensor(action)
        probs = policy(state)
        m = Categorical(probs)
        loss = -m.log_prob(action) * reward
        # Flatten so 0-d and 1-d per-step losses can both be concatenated.
        policy_loss.append(loss.reshape(-1))

    # torch.cat raises on an empty list, so skip the update for empty episodes.
    if not policy_loss:
        return

    optimizer.zero_grad()
    policy_loss = torch.cat(policy_loss).sum()
    policy_loss.backward()
    optimizer.step()

# Assume we have an environment simulator with the following interface
class EnvironmentSimulator:
    """Placeholder for the environment interface used by the training loop.

    Both methods are unimplemented and currently return None.
    """

    def reset(self):
        """Start a new episode.

        The training loop passes the result to torch.from_numpy, so a real
        implementation presumably returns a NumPy array (confirm).
        """

    def step(self, action):
        """Apply `action` to the environment.

        The training loop unpacks the result as (next_state, reward, done).
        """

# Training loop
num_episodes = 1000
gamma = 0.99  # Discount factor; read as a global by train_policy_gradient

env_simulator = EnvironmentSimulator()

for episode in range(num_episodes):
    state = env_simulator.reset()
    states, actions, rewards = [], [], []
    done = False

    # Roll out one full episode, recording every (state, action, reward).
    while not done:
        state_tensor = torch.from_numpy(state).float().unsqueeze(0)
        # Sampling only: train_policy_gradient recomputes the forward pass,
        # so there is no need to build an autograd graph here.
        with torch.no_grad():
            probs = policy(state_tensor)
        m = Categorical(probs)
        action = m.sample()

        states.append(state)
        actions.append(action)
        next_state, reward, done = env_simulator.step(action.item())
        rewards.append(reward)

        state = next_state

    # One policy update per finished episode.
    train_policy_gradient(policy, optimizer, states, actions, rewards)


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions import Categorical

# Constants
# NOTE: num_features and num_chiller_states are `...` placeholders. Replace
# them with integers first; num_actions below adds num_chiller_states to ints.
num_systems = 4  # This can be dynamic based on the systems you want to operate
num_continuous_controls = 5  # Number of variables with continuous control (+-0.1)
num_discrete_controls = 3  # Number of variables with discrete control (natural numbers)
num_features = ...  # Number of feature columns in the environment table, excluding static columns such as 'MM-DD hour' and 'System Identifier'

num_continuous_actions_per_control = 3  # Each continuous control can stay the same, increase, or decrease
num_chiller_states = ...  # The number of states for the chiller (e.g., off, 1, 2, ...)
num_heat_exchanger_states = 2  # The number of states for the heat exchanger
num_cooling_towers = 4  # Number of cooling towers you can operate

# Size of the policy output: one slot per (continuous control, adjustment)
# pair, plus one slot per chiller state, heat-exchanger state and cooling tower.
num_actions = (num_continuous_controls * num_continuous_actions_per_control) + num_chiller_states + num_heat_exchanger_states + num_cooling_towers
learning_rate = 0.01
gamma = 0.99  # Discount factor for rewards

class PolicyNetwork(nn.Module):
    """Three-layer fully connected policy that outputs action probabilities.

    Args:
        num_features: Width of the input layer (size of one observation).
        num_actions: Width of the output layer (number of actions).
    """

    # The constructor must be named __init__ (and call the parent's __init__),
    # otherwise nn.Module is never initialised and no layers are created.
    def __init__(self, num_features, num_actions):
        super(PolicyNetwork, self).__init__()
        self.fc1 = nn.Linear(num_features, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, num_actions)

    def forward(self, x):
        """Return a softmax distribution over the last dimension of the output."""
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = self.fc3(x)
        return torch.softmax(x, dim=-1)


# Instantiate at module level. Inside the class body the name PolicyNetwork
# is not bound yet, and the training code expects module-level names.
policy = PolicyNetwork(num_features, num_actions)
optimizer = optim.Adam(policy.parameters(), lr=learning_rate)

def discount_rewards(rewards, gamma):
    """Compute the discounted return for every step of one episode.

    Uses the recursion G_t = r_t + gamma * G_{t+1}, evaluated from the last
    step backwards.

    Args:
        rewards: Iterable of per-step rewards in time order.
        gamma: Discount factor applied to future rewards.

    Returns:
        list[float]: Discounted returns, one per input reward. An empty
        input yields an empty list.
    """
    rewards = list(rewards)
    discounted = [0.0] * len(rewards)
    running_return = 0.0
    # Walk backwards so each step can reuse the return of the step after it.
    for t in reversed(range(len(rewards))):
        running_return = float(rewards[t]) + gamma * running_return
        discounted[t] = running_return
    return discounted

def train_policy_gradient(policy, optimizer, states, actions, rewards):
    """Run one policy-gradient update from a single episode.

    The loss is the sum over steps of -log pi(a_t | s_t) * G_t, where G_t is
    the discounted return produced by discount_rewards using the module-level
    `gamma`.

    Args:
        policy: Module mapping a state tensor to action probabilities.
        optimizer: Optimizer over the policy's parameters.
        states: Per-step observations (array-likes or tensors).
        actions: Per-step sampled actions (tensors or plain ints).
        rewards: Per-step rewards.

    Returns:
        None. Does nothing when the episode contains no steps.
    """
    discounted_rewards = discount_rewards(rewards, gamma)
    policy_loss = []
    for state, action, reward in zip(states, actions, discounted_rewards):
        state = torch.as_tensor(state, dtype=torch.float32)
        # log_prob needs a tensor; this also accepts actions stored as ints.
        action = torch.as_tensor(action)
        probs = policy(state)
        m = Categorical(probs)
        loss = -m.log_prob(action) * reward
        policy_loss.append(loss)

    # torch.stack raises on an empty list, so skip the update for empty episodes.
    if not policy_loss:
        return

    optimizer.zero_grad()
    policy_loss = torch.stack(policy_loss).sum()
    policy_loss.backward()
    optimizer.step()

def simulate_environment(systems_to_operate, actions):
    """Placeholder: apply `actions` to the systems and report the outcome.

    Unimplemented; currently returns None. The training loop unpacks the
    result as (next_state, reward, done).
    """

# Training loop
num_episodes = 1000

for episode in range(num_episodes):
    # Retrieve current state of the environment
    current_state = ...  # Placeholder: current state for all systems
    states, actions, rewards = [], [], []
    done = False

    # Roll out one full episode; the loop condition ends it once `done` is set.
    while not done:
        state_tensor = torch.from_numpy(current_state).float().unsqueeze(0)
        # Sampling only: train_policy_gradient recomputes the forward pass,
        # so there is no need to build an autograd graph here.
        with torch.no_grad():
            probs = policy(state_tensor)
        m = Categorical(probs)
        action = m.sample()

        states.append(current_state)
        actions.append(action)

        # Apply the action to the environment and retrieve next state and reward
        next_state, reward, done = simulate_environment(num_systems, action)

        rewards.append(reward)
        current_state = next_state

    # Update once per episode. This call must sit inside the episode loop,
    # otherwise only the last episode's data is ever used for training.
    train_policy_gradient(policy, optimizer, states, actions, rewards)


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions import Categorical

# Constants
num_systems = 4  # We have 4 identical systems
num_continuous_controls = 5  # Variables that can be controlled continuously
num_continuous_actions_per_control = 3  # Each can stay the same, increase, or decrease

# Earlier draft of the discrete action count, kept for reference:
# num_discrete_controls = 1  # Chiller can only be turned on or off
# num_towers_per_system = 4  # Each system has 4 towers that can be turned on or off
# num_plate_exchanger_states = 2  # The plate exchanger can be on or off

# # Since there are 4 towers with on/off states, we calculate the total discrete action space for the towers
# num_tower_actions = 2 ** num_towers_per_system  # Each tower can be either on or off

# # Total number of actions for the continuous controls
# total_continuous_actions = num_continuous_controls * num_continuous_actions_per_control

# # Total number of actions for the discrete controls
# total_discrete_actions = num_discrete_controls + num_tower_actions + num_plate_exchanger_states


num_chillers_per_system = 1  # One chiller per system, with on/off states
num_towers_per_system = 4  # Four towers per system, with on/off states for each
num_plate_exchangers_per_system = 1  # One plate exchanger per system, with on/off states

# Each chiller, tower, and plate exchanger can be on or off, so each device
# type contributes 2 ** (devices across all systems) on/off combinations.
num_discrete_chiller_actions = 2 ** (num_chillers_per_system * num_systems)
num_discrete_tower_actions = 2 ** (num_towers_per_system * num_systems)
num_discrete_exchanger_actions = 2 ** (num_plate_exchangers_per_system * num_systems)

# Summing all discrete actions
num_discrete_actions = num_discrete_chiller_actions + num_discrete_tower_actions + num_discrete_exchanger_actions

# One slot per (continuous control, adjustment) pair. This name was used
# below without ever being defined, which raised NameError.
num_continuous_actions = num_continuous_controls * num_continuous_actions_per_control

# Width of the policy's output layer
num_actions = num_discrete_actions + num_continuous_actions

num_features = ...  # Placeholder: number of observation features from the environment state


# Hyperparameters
learning_rate = 0.01
gamma = 0.99  # Discount factor for rewards

# Policy network
class PolicyNetwork(nn.Module):
    """Three-layer fully connected policy that outputs action probabilities.

    Args:
        num_features: Width of the input layer (size of one observation).
        num_actions: Width of the output layer (number of actions).
    """

    def __init__(self, num_features, num_actions):
        super().__init__()
        # Layer widths: num_features -> 128 -> 64 -> num_actions.
        self.fc1 = nn.Linear(num_features, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, num_actions)

    def forward(self, x):
        """Return a softmax distribution over the last dimension of the output."""
        first_hidden = self.fc1(x).relu()
        second_hidden = self.fc2(first_hidden).relu()
        logits = self.fc3(second_hidden)
        return logits.softmax(dim=-1)

# Instantiate the policy network and the Adam optimizer over its parameters.
# num_features and num_actions are used as layer sizes, so both must be
# integers by the time this line runs.
policy = PolicyNetwork(num_features, num_actions)
optimizer = optim.Adam(policy.parameters(), lr=learning_rate)

# Function to calculate discounted rewards
def discount_rewards(rewards, gamma):
    """Compute the discounted return for every step of one episode.

    Uses the recursion G_t = r_t + gamma * G_{t+1}, evaluated from the last
    step backwards.

    Args:
        rewards: Iterable of per-step rewards in time order.
        gamma: Discount factor applied to future rewards.

    Returns:
        list[float]: Discounted returns, one per input reward. An empty
        input yields an empty list.
    """
    rewards = list(rewards)
    discounted = [0.0] * len(rewards)
    running_return = 0.0
    # Walk backwards so each step can reuse the return of the step after it.
    for t in reversed(range(len(rewards))):
        running_return = float(rewards[t]) + gamma * running_return
        discounted[t] = running_return
    return discounted

# Training function for policy gradients
def train_policy_gradient(policy, optimizer, states, actions, rewards):
    """Run one policy-gradient update from a single episode.

    The loss is the sum over steps of -log pi(a_t | s_t) * G_t, where G_t is
    the discounted return produced by discount_rewards using the module-level
    `gamma`.

    Args:
        policy: Module mapping a state tensor to action probabilities.
        optimizer: Optimizer over the policy's parameters.
        states: Per-step observations (array-likes or tensors).
        actions: Per-step sampled actions, as plain ints or tensors.
        rewards: Per-step rewards.

    Returns:
        None. Does nothing when the episode contains no steps.
    """
    discounted_rewards = discount_rewards(rewards, gamma)
    policy_loss = []
    for state, action, reward in zip(states, actions, discounted_rewards):
        state = torch.as_tensor(state, dtype=torch.float32)
        # The training loop stores action.item() (a Python int), but
        # Categorical.log_prob requires a tensor argument.
        action = torch.as_tensor(action)
        probs = policy(state)
        # We will need to expand this for the multi-action space
        action_distribution = Categorical(probs)
        loss = -action_distribution.log_prob(action) * reward
        policy_loss.append(loss)

    # torch.stack raises on an empty list, so skip the update for empty episodes.
    if not policy_loss:
        return

    optimizer.zero_grad()
    policy_loss = torch.stack(policy_loss).sum()
    policy_loss.backward()
    optimizer.step()

# Function to simulate the environment
def simulate_environment(systems_to_operate, actions):
    """Placeholder: apply `actions` to the systems and report the outcome.

    Unimplemented; currently returns None. The training loop unpacks the
    result as (next_state, reward, done).
    """

# Training loop
num_episodes = 1000
for episode in range(num_episodes):
    # Retrieve the current state of the environment
    current_state = ...  # Placeholder: fetch the current state for all systems from the environment
    states, actions, rewards = [], [], []
    done = False

    # Roll out one full episode; the loop condition ends it once `done` is set.
    while not done:
        state_tensor = torch.from_numpy(current_state).float().unsqueeze(0)
        # Sampling only: train_policy_gradient recomputes the forward pass,
        # so there is no need to build an autograd graph here.
        with torch.no_grad():
            probs = policy(state_tensor)
        # We will need to adapt the action selection to our multi-action space
        action_distribution = Categorical(probs)
        action = action_distribution.sample()

        states.append(current_state)
        actions.append(action.item())  # Assuming a single discrete action space

        # Apply the action to the environment and retrieve the next state and reward
        next_state, reward, done = simulate_environment(num_systems, action.item())

        rewards.append(reward)
        current_state = next_state

    # One policy update per finished episode.
    train_policy_gradient(policy, optimizer, states, actions, rewards)


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions import Categorical, Normal

# Constants
num_systems = 4
num_continuous_controls = 5

# Size of one observation. Presumably 23 features for each of the 4 systems;
# TODO confirm against the environment table.
num_features = 23 * 4
# The original assignment used the misspelled name below, leaving
# `num_features` undefined in this cell. Alias kept so old references resolve.
num_freature = num_features

# Defined here so this cell does not depend on an earlier cell having run
# (same value the previous cell uses: four towers per system).
num_towers_per_system = 4

# For each system:
num_chiller_actions = 2  # On or off
num_plate_exchanger_actions = 2  # On or off
num_cooling_tower_combinations = 2 ** num_towers_per_system  # Each can be on or off

# The action to turn the entire system off (master switch)
num_system_off_action = 2

# For num_systems systems, the discrete action space is:
num_discrete_controls_per_system = num_chiller_actions + num_plate_exchanger_actions + num_cooling_tower_combinations
# NOTE(review): this count also adds num_continuous_controls, although the
# network has a separate continuous head. Confirm that is intended.
num_discrete_controls = ((num_discrete_controls_per_system + num_continuous_controls + num_system_off_action) * num_systems)


# Hyperparameters
learning_rate = 0.01
gamma = 0.99  # Discount factor for rewards

# Policy network
class PolicyNetwork(nn.Module):
    """Two-headed policy: Gaussian parameters plus discrete-action logits.

    Args:
        num_features: Width of the input layer (size of one observation).
        num_continuous_controls: Number of continuous controls. The continuous
            head emits a mean and a standard deviation for each.
        num_discrete_controls: Width of the discrete head's logit vector.
    """

    def __init__(self, num_features, num_continuous_controls, num_discrete_controls):
        super(PolicyNetwork, self).__init__()
        # Stored so forward() splits the continuous head by this instance's
        # own size rather than by a module-level global of the same name.
        self.num_continuous_controls = num_continuous_controls

        # Shared layers
        self.shared_layers = nn.Sequential(
            nn.Linear(num_features, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU()
        )

        # Branch for continuous controls
        self.continuous_head = nn.Sequential(
            nn.Linear(64, num_continuous_controls * 2)  # Mean and std dev for each control
        )

        # Branch for discrete controls
        self.discrete_head = nn.Sequential(
            nn.Linear(64, num_discrete_controls)  # Logits for discrete actions
        )

    def forward(self, x):
        """Map observations to (means, std_devs, discrete_logits).

        Args:
            x: Tensor whose last dimension is num_features. Any leading batch
                dimensions (including none) are accepted.

        Returns:
            tuple: `means` and `std_devs`, each with last dimension
            num_continuous_controls, and `discrete_logits` with last dimension
            num_discrete_controls. `std_devs` is clamped to at least 1e-3.
        """
        x = self.shared_layers(x)

        # Continuous actions output: first half means, second half std devs.
        # `...` indexing splits the last dimension for batched or unbatched input.
        continuous_actions = self.continuous_head(x)
        n = self.num_continuous_controls
        means = continuous_actions[..., :n]
        # NOTE(review): clamp passes no gradient for raw outputs below the
        # minimum; a softplus parameterisation may train better.
        std_devs = torch.clamp(continuous_actions[..., n:], min=1e-3)

        # Discrete actions output
        discrete_logits = self.discrete_head(x)

        return means, std_devs, discrete_logits

# Instantiate the policy network and the Adam optimizer over its parameters.
# num_features is used as the input layer size, so it must be defined as an
# integer before this line runs.
policy = PolicyNetwork(num_features, num_continuous_controls, num_discrete_controls)
optimizer = optim.Adam(policy.parameters(), lr=learning_rate)


def discount_rewards(rewards, gamma):
    """Compute the discounted return for every step of one episode.

    Uses the recursion G_t = r_t + gamma * G_{t+1}, evaluated from the last
    step backwards.

    Args:
        rewards: Iterable of per-step rewards in time order.
        gamma: Discount factor applied to future rewards.

    Returns:
        list[float]: Discounted returns, one per input reward. An empty
        input yields an empty list.
    """
    rewards = list(rewards)
    discounted = [0.0] * len(rewards)
    running_return = 0.0
    # Walk backwards so each step can reuse the return of the step after it.
    for t in reversed(range(len(rewards))):
        running_return = float(rewards[t]) + gamma * running_return
        discounted[t] = running_return
    return discounted

# Training function for policy gradients
def train_policy_gradient(policy, optimizer, states, actions, rewards):
    """Run one policy-gradient update for the hybrid continuous/discrete policy.

    The policy returns (means, std_devs, discrete_logits). The log-probability
    of a step is the sum of the Normal log-probabilities of the continuous
    controls and the Categorical log-probability of the discrete action. The
    loss is the sum over steps of -log_prob * G_t, where G_t comes from
    discount_rewards using the module-level `gamma`.

    Args:
        policy: Module returning (means, std_devs, discrete_logits).
        optimizer: Optimizer over the policy's parameters.
        states: Per-step observations (array-likes or tensors).
        actions: Per-step (continuous_action, discrete_action) pairs.
        rewards: Per-step rewards.

    Returns:
        None. Does nothing when the episode contains no steps.
    """
    discounted_rewards = discount_rewards(rewards, gamma)
    policy_loss = []
    for state, action, reward in zip(states, actions, discounted_rewards):
        state = torch.as_tensor(state, dtype=torch.float32)
        # Add a batch dimension, as the rollout does before calling the policy.
        if state.dim() == 1:
            state = state.unsqueeze(0)

        continuous_action, discrete_action = action
        continuous_action = torch.as_tensor(continuous_action, dtype=torch.float32)
        discrete_action = torch.as_tensor(discrete_action)

        means, std_devs, discrete_logits = policy(state)
        continuous_log_prob = Normal(means, std_devs).log_prob(continuous_action).sum()
        discrete_log_prob = Categorical(logits=discrete_logits).log_prob(discrete_action).sum()

        # .sum() makes both terms 0-d, so the per-step losses stack cleanly.
        loss = -(continuous_log_prob + discrete_log_prob) * reward
        policy_loss.append(loss)

    # torch.stack raises on an empty list, so skip the update for empty episodes.
    if not policy_loss:
        return

    optimizer.zero_grad()
    policy_loss = torch.stack(policy_loss).sum()
    policy_loss.backward()
    optimizer.step()
    
# Assume we have an environment simulator with the following interface
class EnvironmentSimulator:
    """Placeholder for the environment interface.

    The class body was empty, which is a syntax error. The stub methods below
    mirror the reset/step interface sketched in the first cell; both are
    unimplemented and return None.
    """

    def reset(self):
        """Start a new episode (unimplemented)."""
        ...

    def step(self, action):
        """Apply `action` to the environment (unimplemented)."""
        ...

# Training loop
num_episodes = 1000
for episode in range(num_episodes):
    # Retrieve the current state of the environment
    current_state = ...  # Placeholder: fetch the current state for all systems from the environment
    states, actions, rewards = [], [], []
    done = False

    # Roll out one full episode; the loop condition ends it once `done` is set.
    while not done:
        state_tensor = torch.from_numpy(current_state).float().unsqueeze(0)
        # Sampling only, so no autograd graph is needed for this forward pass.
        with torch.no_grad():
            means, std_devs, discrete_logits = policy(state_tensor)

        # Sample from the distributions for continuous actions
        continuous_distribution = Normal(means, std_devs)
        continuous_actions = continuous_distribution.sample()

        # Sample from the distribution for discrete actions
        discrete_distribution = Categorical(logits=discrete_logits)
        discrete_actions = discrete_distribution.sample()

        # You'll need to process these actions into a valid format for your environment
        # ...
        states.append(current_state)
        # Store the pair as tensors. continuous_actions has one value per
        # continuous control, so .item() raises on it. The real-system loop
        # below uses the same (continuous, discrete) layout.
        action = (continuous_actions, discrete_actions)
        actions.append(action)

        # Apply the action to the environment and retrieve the next state and reward
        next_state, reward, done = simulate_environment(num_systems, action)

        rewards.append(reward)
        current_state = next_state

    # One policy update per finished episode.
    train_policy_gradient(policy, optimizer, states, actions, rewards)


    
# Skeleton of the same training loop driven by the real system instead of a
# simulator.
# NOTE(review): not runnable as written. Inside the while loop `next_state`
# and `reward` are only assigned in commented-out lines, and `done` is never
# updated, so the loop has no exit condition. get_real_system_state is not
# defined in the visible code.
for episode in range(num_episodes):
    # Retrieve the current state of the real environment
    current_state = get_real_system_state()  # This function needs to be implemented to fetch real-world data

    # Initialize episode memory
    states, actions, rewards = [], [], []
    done = False
    
    while not done:
        # Add a leading batch dimension before calling the policy
        state_tensor = torch.from_numpy(current_state).float().unsqueeze(0)
        means, std_devs, discrete_logits = policy(state_tensor)
        
        # Sample from the distributions for continuous actions
        continuous_distribution = Normal(means, std_devs)
        continuous_actions = continuous_distribution.sample()
        
        # Apply constraints to continuous actions here if necessary
        # ...

        # Sample from the distribution for discrete actions
        discrete_distribution = Categorical(logits=discrete_logits)
        discrete_actions = discrete_distribution.sample()

        # Combine and possibly constrain the discrete actions here
        # ...

        # Apply the actions to the real system and get the new state and reward
        # Ensure this is done in a safe manner with proper error checking
        # next_state, reward = apply_actions_to_real_system(continuous_actions, discrete_actions)
        # ...

        states.append(current_state)
        actions.append((continuous_actions, discrete_actions))  # Store actions taken as a (continuous, discrete) pair of tensors

        # Calculate reward based on the power consumption difference
        # reward = calculate_reward_from_power_consumption(current_state, next_state)
        # ...

        # NOTE(review): `reward` and `next_state` must be assigned above
        # (currently commented out) before these two lines can run.
        rewards.append(reward)
        current_state = next_state

        # Implement your stopping criteria
        # TODO: set `done = True` here when the episode should end
        # ...

    # Update policy after each episode or after collecting enough data
    train_policy_gradient(policy, optimizer, states, actions, rewards)