In [1]:
import random
from pathlib import Path

import numpy as np
import torch
from citylearn.citylearn import CityLearnEnv
from matplotlib import pyplot as plt


from agents.base_agent import Agent
from agents.random_agent import RandomAgent
from agents.sac import SACAgent

In [5]:
decentralized_observation_space_dim = 29
decentralized_action_space_dim = 6 
building_number = 3
    
    # Initialize SAC Agent

agents = []
for i in range(building_number):
    agents.append(
        SACAgent(
            observation_space_dim=decentralized_observation_space_dim, 
            action_space_dim=decentralized_action_space_dim,
            hidden_dim=256,
            buffer_size=100000,
            batch_size=256,
            learning_rate=3e-4,
            gamma=0.99,
            tau=0.01,
            alpha=0.05,
            action_space=decentralized_env.action_space,
            exploration_timesteps = 0
        )
    )


    
    # Train the agent
rewards = train_sac_agent(decentralized_env, agents, episodes=3, decentralized=True)

  self.load_state_dict(torch.load(self.filepath)['model_state_dict'])


29
-44620.4802984519
Episode 1/3, Total Reward: -44620.4802984519
29
-86011.0707178354
Episode 2/3, Total Reward: -41390.59041938351
29
-129099.5891906464
Episode 3/3, Total Reward: -43088.51847281098


In [2]:
def train_sac_agent(
    env: CityLearnEnv, 
    agents: list[SACAgent], 
    episodes: int = 100, 
    decentralized = False
) -> None:
    """Train SAC agent in the environment"""
    total_reward = 0
    
    reward_list = []
    
    for episode in range(episodes):
        # Reset environment and get initial observation
        observation = env.reset()
        print(len(observation[0]))

        episode_reward = 0
        
        while not env.done:
            if not decentralized:
                flat_observation = np.concatenate(observation) if isinstance(observation, list) else observation

            
            # select actions based on different paradigms
            if decentralized:
                actions = [0 for _ in range(len(agents))]
                
                for i in range(len(agents)):
                    # agent_actions is used for the replay buffer
                    actions[i] = agents[i].select_action(observation[i]).tolist()
                    # ADD THIS TO REPLAY BUFFER SAMPLING ASW
            else:
                actions = [agents[0].select_action(flat_observation).tolist()]
            
            #print(f"actions: {actions}") # action is a list of lists (one for each agent) of actions)
            for agent in agents:
                agent.total_steps += 1
                    
            next_observation, reward, info, done = env.step(actions)
            
            reward_list.append(reward)

            if not decentralized:       
                flat_next_observation = np.concatenate(next_observation) if isinstance(next_observation, list) else next_observation
            
            episode_reward += np.sum(reward)

            if decentralized:
                for i in range(len(agents)):
                    agents[i].replay_buffer.push(
                        observation[i], 
                        actions[i], 
                        np.sum(reward),
                        next_observation[i], 
                        len(done)
                    )
            else:               
                agents[0].replay_buffer.push(
                    flat_observation, 
                    actions, 
                    np.sum(reward),
                    flat_next_observation, 
                    len(done)
                )
            
            if agent.total_steps >= agent.exploration_timesteps:
                agent.train()
            
            
            observation = next_observation
        
        total_reward += episode_reward
        print(total_reward)
        
        print(f"Episode {episode+1}/{episodes}, Total Reward: {episode_reward}")
    
    return reward_list

In [4]:
SEED = 0
np.random.seed(SEED)
random.seed(SEED)
torch.manual_seed(SEED)
root_directory = Path("../data/citylearn_challenge_2023_phase_1")
schema_path = root_directory / "schema.json"

centralized_env = CityLearnEnv(
    schema=schema_path,
    root_directory=root_directory,
    random_seed=SEED,
    central_agent=True,
)

decentralized_env = CityLearnEnv(
    schema=schema_path,
    root_directory=root_directory,
    random_seed=SEED,
    central_agent=False,
)


  self.load_state_dict(torch.load(self.filepath)['model_state_dict'])


In [None]:
centralized_centralized_observation_space_dim = 49 
centralized_action_space_dim = 18 # set to 18 and turn on other actions
    
    
    # Initialize SAC Agent
sac_agent = SACAgent(
    observation_space_dim=centralized_centralized_observation_space_dim, 
    action_space_dim=centralized_action_space_dim,
    hidden_dim=256,
    buffer_size=100000,
    batch_size=256,
    learning_rate=3e-4,
    gamma=0.99,
    tau=0.01,
    alpha=0.05,
    action_space=env.action_space,
    exploration_timesteps = 0
)


    
    # Train the agent
rewards = train_sac_agent(centralized_env, sac_agent, episodes=3)

In [9]:
def centralized_interact_with_env(
    env: CityLearnEnv, agent: Agent = RandomAgent, episodes: int = 100
) -> None:
    """Interact with environment using agent"""
    reward_list = []
    for episode in range(episodes):
        observation = env.reset()
        while not env.done:
            action = agent.select_action(observation)
            observation, reward, info, done = env.step(action)
            reward_list.append(reward)
    
    return reward_list

In [None]:
def decentralized_interact_with_env(
    env: CityLearnEnv, agent: Agent = RandomAgent, episodes: int = 100
) -> None:
    """Interact with environment using agent"""
    reward_list = []
    for episode in range(episodes):
        observation = env.reset()
        while not env.done:
            action = agent.select_action(observation)
            observation, reward, info, done = env.step(action)
            reward_list.append(reward)
    
    return reward_list

In [6]:
# random_agent = RandomAgent(env.observation_space, env.action_space)
# random_rewards = centralized_interact_with_env(env, random_agent, episodes=2)

In [10]:
flat = np.concatenate(rewards)

In [47]:
import matplotlib
matplotlib.use('inline')
plt.figure(figsize=(10, 6))
plt.plot(flat, 'k')
plt.savefig("test.png")

In [13]:
import pandas as pd
def format_evaluation(evaluation_data: dict) -> pd.DataFrame:
    kpis = pd.DataFrame.from_dict(
        evaluation_data, orient="index", columns=["value", "display_name", "weight"]
    )
    kpis_reset = kpis.reset_index().rename(columns={"index": "metric"})
    return kpis_reset

In [14]:
format_evaluation(env.evaluate_citylearn_challenge())

Unnamed: 0,metric,value,display_name,weight
0,carbon_emissions_total,0.885068,Carbon emissions,0.1
1,discomfort_proportion,0.886691,Unmet hours,0.3
2,ramping_average,1.438163,Ramping,0.075
3,daily_one_minus_load_factor_average,1.112994,Load factor,0.075
4,daily_peak_average,0.930902,Daily peak,0.075
5,annual_peak_average,0.867638,All-time peak,0.075
6,one_minus_thermal_resilience_proportion,,Thermal resilience,0.15
7,power_outage_normalized_unserved_energy_total,,Unserved energy,0.15
8,average_score,0.680741,Score,
