In [None]:
from citylearn import  CityLearn, building_loader, auto_size
from energy_models import HeatPump, EnergyStorage, Building
import matplotlib.pyplot as plt
import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
import collections
import gym
from gym.utils import seeding
from gym import core, spaces
import os
import ptan
import time
import argparse
import model, common
from matplotlib.pyplot import figure
import numpy as np
from reward_function import reward_function

In [None]:
from pathlib import Path

# Both input CSV files are resolved relative to the local "data" directory.
data_folder = Path("data")

demand_file = data_folder.joinpath("AustinResidential_TH.csv")
weather_file = data_folder.joinpath("Austin_Airp_TX-hour.csv")

In [None]:
# Ids of the ten buildings to simulate; each id keys the per-building device dicts and
# is passed to Building(...) in the environment-construction cell.
building_ids = [4, 5, 9, 16, 21, 26, 33, 36, 49, 59]

In [None]:
# Per-building devices, each dict keyed by building id.
heat_pump = {}
heat_tank = {}
cooling_tank = {}

# Storage loss coefficient: 0.19 spread over 24 (presumably a daily loss fraction
# converted to a per-hour value -- TODO confirm against the reference below).
# Ref: Assessment of energy efficiency in electric storage water heaters (2008 Energy and Buildings)
loss_factor = 0.19/24

buildings = []
for uid in building_ids:
    # Devices are created with a very large placeholder size (9e12); auto_size() is
    # called on the buildings after the loop.
    heat_pump[uid] = HeatPump(
        nominal_power=9e12,
        eta_tech=0.22,
        t_target_heating=45,
        t_target_cooling=10,
    )
    heat_tank[uid] = EnergyStorage(capacity=9e12, loss_coeff=loss_factor)
    cooling_tank[uid] = EnergyStorage(capacity=9e12, loss_coeff=loss_factor)

    # The same heat pump instance serves as both the heating and the cooling device.
    buildings.append(
        Building(
            uid,
            heating_storage=heat_tank[uid],
            cooling_storage=cooling_tank[uid],
            heating_device=heat_pump[uid],
            cooling_device=heat_pump[uid],
        )
    )

    # Bounds are passed as (high, low) arrays: three state components, one action component.
    buildings[-1].state_space(np.array([24.0, 40.0, 1.001]), np.array([1.0, 17.0, -0.001]))
    buildings[-1].action_space(np.array([0.5]), np.array([-0.5]))

building_loader(demand_file, weather_file, buildings)
auto_size(buildings, t_target_heating=45, t_target_cooling=10)
env = CityLearn(
    demand_file,
    weather_file,
    buildings=buildings,
    time_resolution=1,
    simulation_period=(3500, 6000),
)

In [None]:
import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
import collections
import numpy as np
import random
import copy

In [None]:
class DDPGActor(nn.Module):
    """Deterministic policy network: observation in, action out.

    Two hidden layers of 4 ReLU units feed a Tanh output layer, so every action
    component lies in [-1, 1].

    :param obs_size: number of observation features
    :param act_size: number of action components
    """

    def __init__(self, obs_size, act_size):
        super().__init__()
        hidden = 4
        layers = [
            nn.Linear(obs_size, hidden),
            nn.ReLU(),
            nn.Linear(hidden, hidden),
            nn.ReLU(),
            nn.Linear(hidden, act_size),
            nn.Tanh(),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Return the action tensor produced for observation tensor ``x``."""
        action = self.net(x)
        return action

class DDPGCritic(nn.Module):
    """Q-value network scoring an (observation, action) pair with a single scalar.

    The observation is first encoded into 8 ReLU features; those features are
    concatenated with the action and passed through a 6-unit ReLU layer and a
    linear output unit.

    :param obs_size: number of observation features
    :param act_size: number of action components
    """

    def __init__(self, obs_size, act_size):
        super().__init__()
        obs_features = 8
        self.obs_net = nn.Sequential(nn.Linear(obs_size, obs_features), nn.ReLU())
        self.out_net = nn.Sequential(
            nn.Linear(obs_features + act_size, 6),
            nn.ReLU(),
            nn.Linear(6, 1),
        )

    def forward(self, x, a):
        """Return Q(x, a); ``x`` and ``a`` are joined along dim 1, so both must be 2-D."""
        encoded = self.obs_net(x)
        joined = torch.cat((encoded, a), dim=1)
        return self.out_net(joined)
    
class TargetNet:
    """
    Keeps a deep copy of a model (``target_model``) next to the trained model so the
    copy can be refreshed on demand, either fully or by blending.
    """

    def __init__(self, model):
        self.model = model
        self.target_model = copy.deepcopy(model)

    def sync(self):
        """Overwrite the target copy with the current weights of the trained model."""
        current = self.model.state_dict()
        self.target_model.load_state_dict(current)

    def alpha_sync(self, alpha):
        """
        Blend the target weights with the trained model's weights:
        ``target = target * alpha + (1 - alpha) * model``.

        :param alpha: float in (0, 1]; the fraction of the old target that is kept
        """
        assert isinstance(alpha, float)
        assert 0.0 < alpha <= 1.0
        source = self.model.state_dict()
        blended = self.target_model.state_dict()
        for name in source:
            blended[name] = blended[name] * alpha + (1 - alpha) * source[name]
        self.target_model.load_state_dict(blended)
        
    
class Batch:
    """Unbounded replay memory of ``(state, action, reward, next_state)`` tuples."""

    def __init__(self):
        # Stored transitions, oldest first; nothing is ever evicted.
        self.batch = []

    def append_sample(self, sample):
        """Store one transition.

        :param sample: indexable of at least four items
            ``(state, action, reward, next_state)``
        """
        self.batch.append(sample)

    def sample(self, sample_size):
        """Draw up to ``sample_size`` stored transitions uniformly without replacement.

        The request is clamped to the number of stored transitions, so asking for more
        than is available returns every stored transition (in random order) instead of
        raising.

        :param sample_size: maximum number of transitions to draw
        :return: four parallel lists ``(states, actions, rewards, next_states)``
        """
        # Sample from the stored list itself and honour the sample_size argument.
        count = min(sample_size, len(self.batch))
        s, a, r, s_next = [], [], [], []
        for values in random.sample(self.batch, count):
            s.append(values[0])
            a.append(values[1])
            r.append(values[2])
            s_next.append(values[3])
        return s, a, r, s_next

    def length(self):
        """Return the number of stored transitions."""
        return len(self.batch)
        
    
class RL_Agents:
    """One independent DDPG learner per building.

    Every building index ``i`` owns an actor, a critic, a target copy of each, two Adam
    optimizers and a replay memory. Buildings never share networks or experience.
    """

    # Hyper-parameters live on the class so every method can read them (they used to be
    # locals of __init__ and were therefore invisible to add_to_batch).
    LEARNING_RATE_ACTOR = 1e-4
    LEARNING_RATE_CRITIC = 1e-3
    MIN_REPLAY_MEMORY = 100   # training starts once a memory holds more than this
    BATCH_SIZE = 50000        # upper bound; clamped to the memory size when sampling
    EPOCHS = 6                # gradient updates per building per environment step
    GAMMA = 0.99              # discount; same value as the stand-alone training cell
    TAU = 0.1                 # share of the online weights blended into the targets

    def __init__(self, observation_spaces = None, action_spaces = None):
        """Build the per-building networks, optimizers and replay memories.

        :param observation_spaces: one space per building; only ``shape[0]`` is read
        :param action_spaces: one space per building; only ``shape[0]`` is read
        :raises TypeError: if either argument is omitted
        """
        if observation_spaces is None or action_spaces is None:
            raise TypeError(
                "RL_Agents requires observation_spaces and action_spaces "
                "(one entry per building)")

        self.device = "cpu"
        self.epsilon = 0.3  # scale of the Gaussian exploration noise
        self.n_buildings = len(observation_spaces)
        self.batch = {i: Batch() for i in range(self.n_buildings)}

        self.act_net, self.crt_net, self.tgt_act_net, self.tgt_crt_net, self.act_opt, self.crt_opt = {}, {}, {}, {}, {}, {}
        for i, (o, a) in enumerate(zip(observation_spaces, action_spaces)):
            self.act_net[i] = DDPGActor(o.shape[0], a.shape[0]).to(self.device)
            self.crt_net[i] = DDPGCritic(o.shape[0], a.shape[0]).to(self.device)
            self.tgt_act_net[i] = TargetNet(self.act_net[i])
            self.tgt_crt_net[i] = TargetNet(self.crt_net[i])
            self.act_opt[i] = optim.Adam(self.act_net[i].parameters(), lr=self.LEARNING_RATE_ACTOR)
            self.crt_opt[i] = optim.Adam(self.crt_net[i].parameters(), lr=self.LEARNING_RATE_CRITIC)

    def select_action(self, states):
        """Return one exploratory action per building.

        The action is the actor's output plus ``epsilon``-scaled Gaussian noise, clipped
        to [-1, 1]. (The previous code overwrote the actor output with pure noise.)

        :param states: iterable with one observation per building, in building order
        :return: list of numpy arrays, one per building
        """
        actions = []
        for i, state in enumerate(states):
            # float32 to match the dtype of the network weights
            state_v = torch.as_tensor(np.asarray(state, dtype=np.float32), device=self.device)
            with torch.no_grad():  # acting only, no graph needed
                mu = self.act_net[i](state_v).cpu().numpy()
            noisy = mu + self.epsilon * np.random.normal(size=mu.shape)
            actions.append(np.clip(noisy, -1, 1))
        return actions

    def add_to_batch(self, states, actions, rewards, next_states):
        """Store one transition per building, then train every building that is ready.

        A building is trained for ``EPOCHS`` updates once its memory holds more than
        ``MIN_REPLAY_MEMORY`` transitions. The interface carries no ``done`` flag, so
        terminal transitions are bootstrapped like any other.

        :param states: per-building observations before the step
        :param actions: per-building actions that were applied
        :param rewards: per-building rewards (assumed one scalar each -- TODO confirm)
        :param next_states: per-building observations after the step
        """
        for i, sample in enumerate(zip(states, actions, rewards, next_states)):
            self.batch[i].append_sample(sample)

        for i in range(self.n_buildings):
            if self.batch[i].length() <= self.MIN_REPLAY_MEMORY:
                continue
            for _ in range(self.EPOCHS):
                self._train_step(i)

    def _sample_tensors(self, i):
        """Draw a random minibatch for building ``i`` as float32 tensors."""
        # Read the stored list directly so the request can be clamped to what is available.
        stored = self.batch[i].batch
        picked = random.sample(stored, min(self.BATCH_SIZE, len(stored)))
        s, a, r, s_next = zip(*picked)
        n = len(picked)

        def as_tensor(values, shape):
            array = np.asarray(values, dtype=np.float32)
            return torch.as_tensor(array, device=self.device).reshape(shape)

        return (as_tensor(s, (n, -1)), as_tensor(a, (n, -1)),
                as_tensor(r, (n,)), as_tensor(s_next, (n, -1)))

    def _train_step(self, i):
        """Run one critic update, one actor update and one soft target sync for building ``i``."""
        states_v, actions_v, rewards_v, next_states_v = self._sample_tensors(i)

        # TRAIN CRITIC: regress Q(s, a) onto r + gamma * Q_target(s', mu_target(s'))
        self.crt_opt[i].zero_grad()
        q_v = self.crt_net[i](states_v, actions_v)
        with torch.no_grad():  # the target is a constant for the critic loss
            next_act_v = self.tgt_act_net[i].target_model(next_states_v)
            q_next_v = self.tgt_crt_net[i].target_model(next_states_v, next_act_v)
            q_ref_v = rewards_v.unsqueeze(dim=-1) + q_next_v * self.GAMMA
        critic_loss_v = F.mse_loss(q_v, q_ref_v)
        critic_loss_v.backward()
        self.crt_opt[i].step()

        # TRAIN ACTOR: maximise Q(s, mu(s)), i.e. minimise its negated mean
        self.act_opt[i].zero_grad()
        cur_actions_v = self.act_net[i](states_v)
        actor_loss_v = -self.crt_net[i](states_v, cur_actions_v).mean()
        actor_loss_v.backward()
        self.act_opt[i].step()

        # Soft update: each target keeps (1 - TAU) of its previous weights
        self.tgt_act_net[i].alpha_sync(alpha=1 - self.TAU)
        self.tgt_crt_net[i].alpha_sync(alpha=1 - self.TAU)

In [None]:
# NOTE(review): this import rebinds RL_Agents to the version in agent.py, replacing the
# class defined earlier in this notebook -- confirm which one is intended.
from agent import RL_Agents
from reward_function import reward_function

# RL_Agents needs one observation/action space per building. The Building objects are
# assumed to expose them as `observation_spaces` / `action_spaces` (the attribute names
# used by the stand-alone training cell in this notebook) -- TODO confirm.
observation_spaces = [building.observation_spaces for building in buildings]
action_spaces = [building.action_spaces for building in buildings]
agents = RL_Agents(observation_spaces, action_spaces)

episodes = 1
for _ in range(episodes):
    state = env.reset()
    done = False
    while not done:
        action = agents.select_action(state)
        next_state, raw_reward, done, _ = env.step(action)
        # Shape the reward returned by the environment before it is stored
        reward = reward_function(raw_reward)
        agents.add_to_batch(state, action, reward, next_state)
        state = next_state

In [None]:
class DDPGActor(nn.Module):
    """Actor network: maps an observation to an action bounded to [-1, 1] by Tanh.

    Re-running this cell rebinds the name ``DDPGActor``; the architecture is
    Linear(obs, 4) -> ReLU -> Linear(4, 4) -> ReLU -> Linear(4, act) -> Tanh.
    """

    def __init__(self, obs_size, act_size):
        super().__init__()
        width = 4
        stack = (
            nn.Linear(obs_size, width),
            nn.ReLU(),
            nn.Linear(width, width),
            nn.ReLU(),
            nn.Linear(width, act_size),
            nn.Tanh(),
        )
        self.net = nn.Sequential(*stack)

    def forward(self, x):
        """Run observation tensor ``x`` through the network and return the action."""
        return self.net.forward(x)

class DDPGCritic(nn.Module):
    """Critic network: returns one Q-value per (observation, action) row.

    Re-running this cell rebinds the name ``DDPGCritic``. The observation goes through
    Linear(obs, 8) -> ReLU, is concatenated with the action, then through
    Linear(8 + act, 6) -> ReLU -> Linear(6, 1).
    """

    def __init__(self, obs_size, act_size):
        super().__init__()
        encoder_width = 8
        head_width = 6

        self.obs_net = nn.Sequential(
            nn.Linear(obs_size, encoder_width),
            nn.ReLU(),
        )
        self.out_net = nn.Sequential(
            nn.Linear(encoder_width + act_size, head_width),
            nn.ReLU(),
            nn.Linear(head_width, 1),
        )

    def forward(self, x, a):
        """Return Q(x, a); both inputs must be 2-D because they are joined along dim 1."""
        features = self.obs_net(x)
        return self.out_net(torch.cat((features, a), dim=1))

In [None]:
# Scratch check: zipping four equal-length lists yields one 4-tuple per index, which is
# unpacked into a, b, c, d and printed. The loop variables stay bound as notebook globals.
for a, b, c, d in zip([1,2,3],[1,2,3],[1,2,3],[1,2,3]):
    print(a,b,c,d)

In [None]:
# Reset the environment before stepping it manually; the return value is only displayed.
env.reset()

In [None]:
# Scratch: step the environment once with ten single-element actions (values 1..10),
# one per entry of building_ids.
# NOTE(review): `a` is rebound to the first value returned by env.step, so the action
# list is lost after this cell runs; `r` is reused by a later scratch cell.
a = [[i] for i in range(1,11)]
a, r, terminal, _ = env.step(a)

In [None]:
import random
# Scratch check: draws 3 of the 4 tuples without replacement.
random.sample([(1,2),(3,4),(5,6),(7,8)],3)

In [None]:
# Apply the project's reward_function to `r`, the reward value returned by the manual
# env.step call in an earlier scratch cell.
reward_function(r)

In [None]:
# Stand-alone per-building DDPG training script built on ptan experience sources.
#
# NOTE(review): this cell indexes `env[uid]` and `buildings[uid]` and iterates
# `for uid in buildings`, i.e. it treats `env` and `buildings` as dict-like objects keyed
# by building id. The environment-construction cell in this notebook creates `buildings`
# as a list and `env` as a single CityLearn object, so this cell looks like it was written
# against a different setup -- confirm before running.
if __name__ == "__main__":
    # Hyper-parameters. N_AGENTS and TEST_ITERS are not referenced again in this cell.
    N_AGENTS = 2
    GAMMA = 0.99
    BATCH_SIZE = 5000
    LEARNING_RATE_ACTOR = 1e-4
    LEARNING_RATE_CRITIC = 1e-3
    REPLAY_SIZE = 5000
    REPLAY_INITIAL = 100
    TEST_ITERS = 120
    EPSILON_DECAY_LAST_FRAME = 1000
    EPSILON_START = 1.2
    EPSILON_FINAL = 0.02

    device = torch.device("cpu")

    # Per-building containers, all keyed by building id.
    act_net, crt_net, tgt_act_net, tgt_crt_net, agent, exp_source, buffer, act_opt, crt_opt, frame_idx = {}, {}, {}, {}, {}, {}, {}, {}, {}, {}
    # These four tracking dicts are only initialised below; nothing is appended to them here.
    rew_last_1000, rew, track_loss_critic, track_loss_actor = {}, {}, {}, {}
#     for uid in buildings:
#         env[uid].reset()
    for uid in building_ids:
        #Create as many actor and critic nets as number of agents
        #Actor: states_agent_i -> actions_agent_i
        act_net[uid] = DDPGActor(buildings[uid].observation_spaces.shape[0], buildings[uid].action_spaces.shape[0]).to(device)

        #Critic: sized from this building's own observation and action spaces -> Q-value_agent_i [1]
        crt_net[uid] = DDPGCritic(buildings[uid].observation_spaces.shape[0], buildings[uid].action_spaces.shape[0]).to(device)

        # Target copies of both networks (ptan's TargetNet, not the class defined in this notebook)
        tgt_act_net[uid] = ptan.agent.TargetNet(act_net[uid])
        tgt_crt_net[uid] = ptan.agent.TargetNet(crt_net[uid])

        # Acting agent, one-step experience source and replay buffer for this building
        agent[uid] = model.AgentD4PG(act_net[uid], device=device)
        exp_source[uid] = ptan.experience.ExperienceSourceFirstLast(env[uid], agent[uid], gamma=GAMMA, steps_count=1)
        buffer[uid] = ptan.experience.ExperienceReplayBuffer(exp_source[uid], buffer_size=REPLAY_SIZE)
        act_opt[uid] = optim.Adam(act_net[uid].parameters(), lr=LEARNING_RATE_ACTOR)
        crt_opt[uid] = optim.Adam(crt_net[uid].parameters(), lr=LEARNING_RATE_CRITIC)

        frame_idx[uid] = 0

        rew_last_1000[uid], rew[uid], track_loss_critic[uid], track_loss_actor[uid] = [], [], [], []

    # Scratch dicts reused on every training step, keyed by building id
    batch, states_v, actions_v, rewards_v, dones_mask, last_states_v, q_v, last_act_v, q_last_v, q_ref_v, critic_loss_v, cur_actions_v, actor_loss_v = {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}
    cost, price_list, buffer_reward = {},{},{}
    for uid in buildings:
        cost[uid] = []
        price_list[uid] = []
        buffer_reward[uid] = []
    # Run until the environment of the last building id reports a terminal state
    while not env[building_ids[-1]]._terminal():
        # Progress print every 100 frames. The check uses the hard-coded key 4 (the first
        # building id) while the printed counter uses `uid` left over from an earlier loop.
        if frame_idx[4]%100 == 0:
            print(frame_idx[uid])
        for uid in buildings:
#             print(env[uid].time_step)
            # Linearly decay the exploration epsilon, then collect one transition
            agent[uid].epsilon = max(EPSILON_FINAL, EPSILON_START - frame_idx[uid] / EPSILON_DECAY_LAST_FRAME)
            frame_idx[uid] += 1           
            buffer[uid].populate(1)
#             print(buffer[uid].buffer[-1])
#             print(env[uid].buildings[uid].time_step)
            
        
        # Price derived from the latest total electric consumption.
        # NOTE(review): `uid` here is whatever the loop above left behind, so the price is
        # read from, and appended to price_list for, that last building only.
        price = env[uid].total_electric_consumption[-1]*3e-5 + 0.045
        price_list[uid].append(price)
        for uid in buildings:  
            # Replace the stored reward of the newest transition with reward * price
            buffer_reward[uid].append(buffer[uid].buffer[-1].reward)
            electricity_cost = buffer[uid].buffer[-1].reward*price
            cost[uid].append(-electricity_cost)
            buffer[uid].buffer[-1] = buffer[uid].buffer[-1]._replace(reward=electricity_cost)

        # Skip training until the (last-iterated) buffer holds REPLAY_INITIAL transitions
        if len(buffer[uid]) < REPLAY_INITIAL:
            continue   

        for uid in buildings:
            # Six gradient updates per building per environment step
            for k in range(6):
                batch[uid] = buffer[uid].sample(BATCH_SIZE)
                states_v[uid], actions_v[uid], rewards_v[uid], dones_mask[uid], last_states_v[uid] = common.unpack_batch_ddqn(batch[uid], device) 

                # TRAIN CRITIC
                crt_opt[uid].zero_grad()
                #Obtaining Q' using critic net with parameters theta_Q'
                q_v[uid] = crt_net[uid](states_v[uid], actions_v[uid])

                #Obtaining estimated optimal actions a|theta_mu from target actor net and from s_i+1.
                last_act_v[uid] = tgt_act_net[uid].target_model(last_states_v[uid]) #<----- Actor to train Critic

                #Obtaining Q'(s_i+1, a|theta_mu) from the target critic net; zeroed where dones_mask is set
                q_last_v[uid] = tgt_crt_net[uid].target_model(last_states_v[uid], last_act_v[uid])
                q_last_v[uid][dones_mask[uid]] = 0.0

                #Q_target used to train critic net Q'
                q_ref_v[uid] = rewards_v[uid].unsqueeze(dim=-1) + q_last_v[uid] * GAMMA
                critic_loss_v[uid] = F.mse_loss(q_v[uid], q_ref_v[uid].detach())
                critic_loss_v[uid].backward()
                crt_opt[uid].step()

                # TRAIN ACTOR
                act_opt[uid].zero_grad()
                #Obtaining estimated optimal current actions a|theta_mu from actor net and from s_i
                cur_actions_v[uid] = act_net[uid](states_v[uid])

                #Actor loss = mean{ -Q_i'(s_i, a|theta_mu) }
                actor_loss_v[uid] = -crt_net[uid](states_v[uid], cur_actions_v[uid]) #<----- Critic to train Actor
                actor_loss_v[uid] = actor_loss_v[uid].mean()
                #Find gradient of the loss and backpropagate to perform the updates of theta_mu
                actor_loss_v[uid].backward()
                act_opt[uid].step()

                # `% 1 == 0` is always true, so the soft target sync runs after every update
                if frame_idx[uid] % 1 == 0:
                    tgt_act_net[uid].alpha_sync(alpha=1 - 0.1)
                    tgt_crt_net[uid].alpha_sync(alpha=1 - 0.1)

In [None]:
from matplotlib.pyplot import figure

In [None]:
#Plotting all the individual actions
figure(figsize=(18, 6))
for uid in buildings:
    plt.plot(env[uid].action_track[uid][2300:2500])
plt.show()

In [None]:
from __future__ import absolute_import

from networks.actor import Actor
from networks.critic import Critic

from collections import deque
import tensorflow
import random


class DDPG(object):
    """DDPG agent pairing an Actor and a Critic network with a bounded replay memory.

    Transitions are stored with :meth:`remember`; :meth:`train` samples a minibatch,
    updates the critic, then the actor, then both target models.
    """

    def __init__(self, state_size, action_size, actor_hidden_units=(300, 600),
                 actor_learning_rate=0.0001, critic_hidden_units=(300, 600),
                 critic_learning_rate=0.001, batch_size=64, discount=0.99,
                 memory_size=10000, tau=0.001):
        """
        Constructs a DDPG Agent with the given parameters
        :param state_size: Int denoting the world's state dimensionality
        :param action_size: Int denoting the world's action dimensionality
        :param actor_hidden_units: Tuple(Int) denoting the actor's hidden layer
            sizes. Each element in the tuple represents a layer in the Actor
            network and the Int denotes the number of neurons in the layer.
        :param actor_learning_rate: Float denoting the learning rate of the
            Actor network. Best to be some small number close to 0.
        :param critic_hidden_units: Tuple(Int) denoting the critic's hidden
            layer sizes. Each element in the tuple represents a layer in the
            Critic network and the Int denotes the number of neurons in the
            layer.
        :param critic_learning_rate: Float denoting the learning rate of the
            Critic network. Best to be some small number close to 0.
        :param batch_size: Int denoting the batch size for training.
        :param discount: Float denoting the discount (gamma) given to future
            potential rewards when calculating q values
        :param memory_size: Int denoting the maximum number of transitions
            (state, action, reward, done, next state) that the agent remembers
        :param tau: Float forwarded unchanged to both the Actor and the Critic
            (presumably the soft-update rate of their target models; verify
            against networks.actor / networks.critic)
        """

        self._discount = discount
        self._batch_size = batch_size
        self._memory_size = memory_size

        # Call the factory: the networks need the session object itself, not
        # the bound method that creates it.
        tensorflow_session = self._generate_tensorflow_session()

        self._actor = Actor(tensorflow_session=tensorflow_session,
                            state_size=state_size, action_size=action_size,
                            hidden_units=actor_hidden_units,
                            learning_rate=actor_learning_rate,
                            batch_size=batch_size, tau=tau)

        self._critic = Critic(tensorflow_session=tensorflow_session,
                              state_size=state_size, action_size=action_size,
                              hidden_units=critic_hidden_units,
                              learning_rate=critic_learning_rate,
                              batch_size=batch_size, tau=tau)

        # Replay memory, oldest transition on the left; trimmed in remember()
        self._memory = deque()

    def _generate_tensorflow_session(self):
        """
        Generates and returns the tensorflow session, configured so GPU memory
        is allocated on demand (``allow_growth``).
        Uses the TensorFlow 1.x ``ConfigProto`` / ``Session`` API.
        :return: the Tensorflow Session
        """
        config = tensorflow.ConfigProto()
        config.gpu_options.allow_growth = True
        return tensorflow.Session(config=config)

    def get_action(self, state):
        """
        Returns the best action predicted by the agent given the current state.
        :param state: numpy array denoting the current state.
        :return: numpy array denoting the predicted action.
        """
        return self._actor._model.predict(state)

    def train(self):
        """
        Trains the DDPG Agent from its current memory
        Please note that the agent must have remembered more transitions than
        the specified batch size before this method will do anything
        :return: None
        """
        if len(self._memory) > self._batch_size:
            self._train()

    def _train(self):
        """
        Helper method for train. Takes care of sampling, and training and
        updating both the actor and critic networks
        :return: None
        """
        states, actions, rewards, done, next_states = self._get_sample()
        self._train_critic(states, actions, next_states, done, rewards)
        self._train_actor(states)
        self._update_target_models()

    def _get_sample(self):
        """
        Finds a random sample of size self._batch_size from the agent's current
        memory.
        :return: Five parallel tuples denoting the sampled states, actions,
            rewards, done flags, and next states.
        """
        sample = random.sample(self._memory, self._batch_size)
        states, actions, rewards, done, next_states = zip(*sample)
        return states, actions, rewards, done, next_states

    def _train_critic(self, states, actions, next_states, done, rewards):
        """
        Trains the critic network
        C(s, a) -> q
        :param states: List of the states to train the network with
        :param actions: List of the actions to train the network with
        :param next_states: List of the t+1 states to train the network with
        :param done: List(Bool) denoting whether each step was an exit step
        :param rewards: List of rewards to calculate q_targets.
        :return: None
        """
        q_targets = self._get_q_targets(next_states, done, rewards)
        self._critic.train(states, actions, q_targets)

    def _get_q_targets(self, next_states, done, rewards):
        """
        Calculates the q targets with the following formula
        q = r + gamma * next_q
        unless there is no next state in which
        q = r
        :param next_states: List(List(Float)) Denoting the t+1 state
        :param done: List(Bool) denoting whether each step was an exit step
        :param rewards: List(Float) Denoting the reward given in each step
        :return: The q targets
        """
        # NOTE(review): the next actions come from the actor's online model,
        # while the critic uses its target model; DDPG normally uses the
        # target actor here -- confirm whether Actor exposes a target model.
        next_actions = self._actor._model.predict(next_states)
        next_q_values = self._critic._target_model.predict(next_states,
                                                           next_actions)
        q_targets = [reward if this_done else reward + self._discount *
                                                       (next_q_value)
                     for (reward, next_q_value, this_done)
                     in zip(rewards, next_q_values, done)]
        return q_targets

    def _train_actor(self, states):
        """
        Trains the actor network using the calculated deterministic policy
            gradients.
        :param states: List(List(Float)) denoting the states to train the Actor
            on
        :return: None
        """
        gradients = self._get_gradients(states)
        self._actor.train(states, gradients)

    def _get_gradients(self, states):
        """
        Calculates the Deterministic Policy Gradient for Actor training
        :param states: The states to calculate the gradients for.
        :return: Whatever ``Critic.get_gradients`` returns for the given states
            and the actor's current actions for them.
        """
        action_for_gradients = self._actor._model.predict(states)
        # Return the gradients so _train_actor no longer passes None to the actor.
        # TODO: confirm the result has the layout Actor.train expects.
        return self._critic.get_gradients(states, action_for_gradients)

    def _update_target_models(self):
        """
        Updates the target models to slowly track the main models
        :return: None
        """
        self._critic.train_target_model()
        self._actor.train_target_model()

    def remember(self, state, action, reward, done, next_state):
        """
        Stores the given state, action, reward etc in the Agent's memory and
        drops the oldest entry once more than ``memory_size`` are held.
        :param state: The state to remember
        :param action: The action to remember
        :param reward: The reward to remember
        :param done: Whether this was a final state
        :param next_state: The next state (if applicable)
        :return: None
        """
        self._memory.append((state, action, reward, done, next_state))
        if len(self._memory) > self._memory_size:
            self._memory.popleft()