This is the Smart Factory Exercise.

In [12]:
import mdptoolbox, mdptoolbox.example
import numpy as np
import itertools

Define all items, states and actions of our model. <br>
* We have three different items (WHITE, BLUE, RED)
* We have four different possible states of each warehouse field (EMPTY, WHITE, BLUE, RED)
* We have six possible actions for our agent (STORE and RESTORE in combination with each item color)

In [13]:
items = ['WHITE', 'BLUE', 'RED']
fieldStatus = ['WHITE', 'BLUE', 'RED', 'EMPTY']
operations = ['STORE', 'RESTORE']
actions = []
for operation in operations:
    for item in items:
        actions.append((operation, item))

# warehouse size n x n (in our case n = 2)
n = 2
print(items)
print(fieldStatus)
print(actions)

['WHITE', 'BLUE', 'RED']
['WHITE', 'BLUE', 'RED', 'EMPTY']
[('STORE', 'WHITE'), ('STORE', 'BLUE'), ('STORE', 'RED'), ('RESTORE', 'WHITE'), ('RESTORE', 'BLUE'), ('RESTORE', 'RED')]


Create all fields of the warehouse of size lenght x heigth (in our case 2 x 2)

In [14]:
def createWarehouseFields(length, height):
    warehouseFields = []
    for i in range(0, length):
        for j in range(0, height):
            warehouseFields.append((i,j))
    return warehouseFields

warehouseFields = createWarehouseFields(n, n)
print(warehouseFields)

[(0, 0), (0, 1), (1, 0), (1, 1)]


Next create all possible states of our warehouse. 
In our case we have 4 fields with 4 different states each (EMPTY, WHITE, BLUE, RED), which results in 4^4 states.

In [15]:
def getStates(warehouseFields, fieldStatus):  
    return itertools.product(fieldStatus, repeat=len(warehouseFields))

iterStates = getStates(warehouseFields, fieldStatus)
states = []
for state in iterStates:
  states.append(list(state))

print(len(states))

256


Create a reward which fits our problem. <br>
The reward is higher if the distance our agent has to cover is lower.

In [16]:
# read statistics from file to calculate better rewards
# i = 0: White, i = 1: Blue, i = 2: Red
countItems = np.zeros((3))
itemsTotal = 0

warehouseorder = open('Exercise4_warehousetraining2x2.txt')
for line in warehouseorder:
    curAction = line.split('\t')
    # curOperation = curAction[0].upper()
    curItem = curAction[1].strip('\n').upper()
    if curItem == 'WHITE':
        countItems[0] += 1
    if curItem == 'BLUE':
        countItems[1] += 1
    if curItem == 'RED':
        countItems[2] += 1
    itemsTotal += 1

# i = 0: White, i = 1: Blue, i = 2: Red
probsItems = countItems / itemsTotal
print(probsItems)

# reward function based on distance and item probability:
def getRewardBasedOnDistanceAndProbability(fieldIndex, item):
    x = warehouseFields[fieldIndex]
    distance = x[0] + x[1] + 1
    prob = 0
    if item == 'WHITE':
        prob = probsItems[0]
    if item == 'BLUE':
        prob = probsItems[1]
    if item == 'RED':
        prob = probsItems[2]
    reward = 1/distance * 1/distance * prob
    return reward


# simple reward function:
# get the distance in field from 0,0 (0,0 has already distance 1)
# reward is the reciprocal**2 (to punish far away fields even more)
def getRewardBasedOnDistance(x):
    distance = x[0] + x[1] + 1
    reward = 1/distance * 1/distance
    return reward

rewardVector = [getRewardBasedOnDistance(x) for x in warehouseFields]
print(warehouseFields)
print(rewardVector)

[0.25180384 0.24336554 0.50483062]
[(0, 0), (0, 1), (1, 0), (1, 1)]
[1.0, 0.25, 0.25, 0.1111111111111111]


Bring it all together now and create the transition and the reward matrix. <br>

In [17]:
# create Transition and reward matrix
def createTransitionAndRewardMatrix(actions, states):
    T = np.zeros((len(actions), len(states), len(states)))
    R = np.zeros((len(states), len(actions)))

    for i in range(len(actions)):
        # current operation and item
        operation, item = actions[i]

        for j in range(len(states)):
            curState = states[j]

            # if an operation is not valid i.e. store when warehouse is full 
            # or restore when warehouse is empty, stay in the current state 
            if ((operation == 'STORE') and ('EMPTY' not in curState)) \
                or ((operation == 'RESTORE') and (item not in curState)):
                T[i, j, j] = 1                
            else:
                possibleFields = []
                for k in range(len(curState)):
                    # find empty fields where the agent can store the item
                    if operation == 'STORE':
                        if curState[k] == 'EMPTY':
                            possibleFields.append(k)
                    # find fields with the requested item
                    else:
                        if curState[k] == item:
                            possibleFields.append(k)
                # possible fields can not be empty because of the check above
                transitionProbability = 1 / len(possibleFields)
                # set probabiltiy in transition matrix for the possible nextStates
                for field in possibleFields:
                    nextState = curState.copy()
                    if operation == 'STORE':
                        nextState[field] = item
                    else:
                        nextState[field] = 'EMPTY'
                    nextIndex = states.index(nextState)
                    T[i, j, nextIndex] = transitionProbability
                    # set reward in R
                    # R[nextIndex][i] = rewardVector[field] # old simple reward
                    R[nextIndex][i] = getRewardBasedOnDistanceAndProbability(field, item)         
    return T, R

T, R = createTransitionAndRewardMatrix(actions, states)
print(np.shape(T))
test = np.sum(T)
print(test)

(6, 256, 256)
1536.0


Finally create the mdp models and evaluate the different classes

In [18]:
# 1. Policy Iteration
mdpWarehousePolicy = mdptoolbox.mdp.PolicyIteration(T, R, 0.3, max_iter=100)
# Run the MDP
mdpWarehousePolicy.run()

print('PolicyIteration:')
print(mdpWarehousePolicy.policy)
print(mdpWarehousePolicy.V)
print(mdpWarehousePolicy.iter)

PolicyIteration:
(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 5, 5, 5, 0, 5, 5, 5, 0, 5, 5, 5, 2, 0, 0, 2, 0, 5, 5, 5, 0, 5, 5, 5, 1, 5, 5, 5, 2, 1, 1, 2, 1, 5, 5, 5, 2, 5, 5, 5, 2, 5, 5, 5, 2, 2, 2, 2, 2, 0, 0, 2, 0, 1, 1, 2, 1, 2, 2, 2, 2, 5, 5, 2, 2)
(0.35971977148447737, 0.35971977148447737, 0.35971977148447737, 0.3597197714844773, 0.35971977148447737, 0.35971977148447737, 0.35971977148447737, 0.3597197714844773, 0.35971977148447737, 0.35971977148447737, 0.359

In [19]:
# 2. QLearning
mdpWarehouseQ = mdptoolbox.mdp.QLearning(T, R, 0.3)
# Run the MDP
mdpWarehouseQ.run()

print('Q learning:')
print(mdpWarehouseQ.policy)
print(mdpWarehouseQ.V)

Q learning:
(0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 2, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 2, 0, 0, 0, 0, 0, 2, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 4, 0, 0, 0, 0, 0, 0)
(0.3597197714844745, 0.3024774872072901, 0.07568415522103257, 0.14999156929217405, 0.34892323334620984, 0.18685098367363134, 0.2949030170887516, 0.01602673054393656, 0.3571521891017154, 0.044571171996877185, 0.330063102

In [20]:
# 3. RelativeValueIteration:
mdpWarehouseRelIter = mdptoolbox.mdp.RelativeValueIteration(T, R, 0.3)
# Run the MDP
mdpWarehouseRelIter.run()

print('RelativeValueIteration:')
print(mdpWarehouseRelIter.policy)
print(mdpWarehouseRelIter.V)
print(mdpWarehouseRelIter.iter)

RelativeValueIteration:
(0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 1, 1, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 5, 5, 2, 2, 5, 5, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 5, 5, 2, 0, 5, 5, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2)
(0.5345481227834169, 0.5345481227834169, 0.5345481227834169, 0.5345481227834169, 0.5345481227834169, 0.5357710651828299, 0.5345481227834169, 0.5345481227834169, 0.5345481227834169, 0.5345481227834169, 0.5345

In [21]:
# 4. ValueIteration
mdpWarehouseValueIter = mdptoolbox.mdp.ValueIteration(T, R, 0.3)
# Run the MDP
mdpWarehouseValueIter.run()

print('Value Iteration:')
print(mdpWarehouseValueIter.policy)
print(mdpWarehouseValueIter.V)
print(mdpWarehouseValueIter.iter)

Value Iteration:
(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 5, 5, 5, 0, 5, 5, 5, 0, 5, 5, 5, 2, 0, 0, 2, 0, 5, 5, 5, 0, 5, 5, 5, 1, 5, 5, 5, 2, 1, 1, 2, 1, 5, 5, 5, 2, 5, 5, 5, 2, 5, 5, 5, 2, 2, 2, 2, 2, 0, 0, 2, 0, 1, 1, 2, 1, 2, 2, 2, 2, 5, 5, 2, 2)
(0.35000733765439646, 0.35000733765439646, 0.35000733765439646, 0.35000733765439646, 0.35000733765439646, 0.35000733765439646, 0.35000733765439646, 0.35000733765439646, 0.35000733765439646, 0.35000733765439646, 0.3