In [2]:
import numpy as np

In [3]:
#model environment parameters
adjustment_interval_min = 15
charging_duration_min = 120
min_charging_rate_kW = 0
max_charging_rate_kW = 22
battery_capacity_kWh = 40

expected_usage_kWh = 30
sigma_kWh = 5

#optimizer (Reinforcement Learner) parameters
fully_exhousted_penalty = 0.99

#computed parameters
num_charging_adjustments = int(charging_duration_min/adjustment_interval_min)

#series
Intervals = np.linspace(0, num_charging_adjustments - 1, num_charging_adjustments).astype(int)
Actions = np.linspace(min_charging_rate_kW, max_charging_rate_kW, max_charging_rate_kW + 1).astype(int)
P = np.zeros((num_charging_adjustments))

A = np.ones(num_charging_adjustments)

print(Intervals, Actions, P, A)

[0 1 2 3 4 5 6 7] [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22] [0. 0. 0. 0. 0. 0. 0. 0.] [1. 1. 1. 1. 1. 1. 1. 1.]


In [4]:
samples = np.random.normal(expected_usage_kWh, sigma_kWh, 10)

print(samples)

[29.5082227  39.9797153  37.34870601 27.15515446 23.54580005 23.73064905
 30.20621349 21.87115244 27.22455015 24.79985509]


In [81]:
def calcChargingCost(t, p):
    e = np.exp(1)
    return A[t]*(e**p)

def reward(totalChargeByInterval, requiredkWh):
    totalCharged = totalChargeByInterval/(int(60/adjustment_interval_min))
    # diviation penalty
    divitation = abs(30 - totalCharged)
    divitationCost = max(divitation, .001)
    # extra penalty for running out
    runOutCost = -1000 if requiredkWh > totalCharged else 0
    return  divitationCost + runOutCost

In [6]:
print(reward([11,11,11, 11, 22, 10, 10, 9], 30))


Charged 23.75
(6.25, 3585204498.713966, 1000)


In [96]:
#generate env

def convertActionToChargingPower(value):
    if value == 0:
        return 0
    if value == 1:
        return 6
    if value == 2:
        return 12
    if value == 3:
        return 18

class env:
    def __init__(self):
        self.nS = (max_charging_rate_kW + 1) * num_charging_adjustments
        self.nA = 4
        self.P = self.create_P()
        print(self.nS)
        print(self.nA)
        print(self.P)

    def create_P(self):

        numberOfActions = self.nA

        def totalNodes(layer):
            totalNodes = 0
            for lvl in range(0, layer):
                totalNodes = totalNodes + numberOfActions**lvl
            return totalNodes
        
        def getActionValues(level, levelId):
            actionValue = levelId%numberOfActions
            values = [actionValue]
            ids = [levelId]
            for lvl in range(0, level):
                lastActionId = ids[0]//numberOfActions
                lastActionValue = lastActionId%numberOfActions
                values.insert(0, lastActionValue)
                ids.insert(0, lastActionId)
            return values

        actions = list()

        states = list()

        print("Actions", totalNodes(num_charging_adjustments))

        for actionLevel in range(0, num_charging_adjustments):
            
            lastLayerActions = totalNodes(actionLevel)
            layerActions = totalNodes(actionLevel+1)

            states.append([])

            for levelActionId in range(0, numberOfActions**actionLevel):
                # levelActionId:
                #0 0 0 0
                #  1 1 1
                #    2 2
                #      3
                #
                # globalIds:
                #0 1 3 6
                #  2 4 7
                #    5 8
                #      9
                #
                action = levelActionId%numberOfActions
                globalActionId = (lastLayerActions-1) + levelActionId
                actions.append(0)

                originStateLevelId = levelActionId//numberOfActions
                originStateId = originStateLevelId+totalNodes(actionLevel-1)
                destinationStateId = totalNodes(actionLevel)+(originStateLevelId*numberOfActions)+action

                if(actionLevel != num_charging_adjustments - 1):
                    actions[globalActionId] = [ 1, destinationStateId, -calcChargingCost(actionLevel, action), False]

                else:
                    actionValues = getActionValues(actionLevel, levelActionId)
                    charge = 0
                    for i, action in enumerate(actionValues):
                        charge = charge + convertActionToChargingPower(action)
                    actions[globalActionId] = [1, -1, reward(charge, 10)-calcChargingCost(actionLevel, convertActionToChargingPower(action)), False]
        
        return actions

env()

Actions 21845
184
4
[[0, '|', 1, -1.0, False], [0, '|', 2, -2.718281828459045, False], [0, '|', 3, -7.3890560989306495, False], [0, '|', 4, -20.085536923187664, False], [1, '|', 5, -1.0, False], [1, '|', 6, -2.718281828459045, False], [1, '|', 7, -7.3890560989306495, False], [1, '|', 8, -20.085536923187664, False], [2, '|', 9, -1.0, False], [2, '|', 10, -2.718281828459045, False], [2, '|', 11, -7.3890560989306495, False], [2, '|', 12, -20.085536923187664, False], [3, '|', 13, -1.0, False], [3, '|', 14, -2.718281828459045, False], [3, '|', 15, -7.3890560989306495, False], [3, '|', 16, -20.085536923187664, False], [4, '|', 17, -1.0, False], [4, '|', 18, -2.718281828459045, False], [4, '|', 19, -7.3890560989306495, False], [4, '|', 20, -20.085536923187664, False], [5, '|', 21, -1.0, False], [5, '|', 22, -2.718281828459045, False], [5, '|', 23, -7.3890560989306495, False], [5, '|', 24, -20.085536923187664, False], [6, '|', 25, -1.0, False], [6, '|', 26, -2.718281828459045, False], [6, '|',

<__main__.env at 0x13d272af0>

In [8]:
def value_iteration(env, theta=0.0001, discount_factor=1.0):
    """
    Value Iteration Algorithm.
    
    Args:
        env: OpenAI env. env.P represents the transition probabilities of the environment.
            env.P[s][a] is a list of transition tuples (prob, next_state, reward, done).
            env.nS is a number of states in the environment. 
            env.nA is a number of actions in the environment.
        theta: We stop evaluation once our value function change is less than theta for all states.
        discount_factor: Gamma discount factor.
        
    Returns:
        A tuple (policy, V) of the optimal policy and the optimal value function.
    """
    
    def one_step_lookahead(state, V):
        """
        Helper function to calculate the value for all action in a given state.
        
        Args:
            state: The state to consider (int)
            V: The value to use as an estimator, Vector of length env.nS
        
        Returns:
            A vector of length env.nA containing the expected value of each action.
        """
        A = np.zeros(env.nA)
        for a in range(env.nA):
            for prob, next_state, reward, done in env.P[state][a]:
                A[a] += prob * (reward + discount_factor * V[next_state])
        return A
    
    V = np.zeros(env.nS)
    while True:
        # Stopping condition
        delta = 0
        # Update each state...
        for s in range(env.nS):
            # Do a one-step lookahead to find the best action
            A = one_step_lookahead(s, V)
            best_action_value = np.max(A)
            # Calculate delta across all states seen so far
            delta = max(delta, np.abs(best_action_value - V[s]))
            # Update the value function. Ref: Sutton book eq. 4.10. 
            V[s] = best_action_value        
        # Check if we can stop 
        if delta < theta:
            break
    
    # Create a deterministic policy using the optimal value function
    policy = np.zeros([env.nS, env.nA])
    for s in range(env.nS):
        # One step lookahead to find the best action for this state
        A = one_step_lookahead(s, V)
        best_action = np.argmax(A)
        # Always take the best action
        policy[s, best_action] = 1.0
    
    return policy, V