This is the Smart Factory Exercise.

In [2]:
import mdptoolbox, mdptoolbox.example
import numpy as np
import itertools

Define all items, states and actions of our model. <br>
* We have three different items (WHITE, BLUE, RED)
* We have four different possible states of each warehouse field (EMPTY, WHITE, BLUE, RED)
* We have six possible actions for our agent (STORE and RESTORE in combination with each item color)

In [3]:
def createWarehouseFields(length, height):
    warehouseFields = []
    for i in range(0, length):
        for j in range(0, height):
            warehouseFields.append((i,j))
    return warehouseFields

n = 2
warehouseFields = createWarehouseFields(n, n)

actions = warehouseFields.copy()


items = ['WHITE', 'BLUE', 'RED']
operations = ['STORE', 'RESTORE']
warehouseState = ['WHITE', 'BLUE', 'RED', 'EMPTY']
operationsWithItems = []
for operation in operations:
    for item in items:
        operationsWithItems.append((operation, item))


# warehouse size n x n (in our case n = 2)

print(items)
print(warehouseState)
print(actions)
print(operationsWithItems)

['WHITE', 'BLUE', 'RED']
['WHITE', 'BLUE', 'RED', 'EMPTY']
[(0, 0), (0, 1), (1, 0), (1, 1)]
[('STORE', 'WHITE'), ('STORE', 'BLUE'), ('STORE', 'RED'), ('RESTORE', 'WHITE'), ('RESTORE', 'BLUE'), ('RESTORE', 'RED')]


Create all fields of the warehouse of size lenght x heigth (in our case 2 x 2)

[(0, 0), (0, 1), (1, 0), (1, 1)]


Next create all possible states of our warehouse. 
In our case we have 4 fields with 4 different states each (EMPTY, WHITE, BLUE, RED), which results in 4^4 states.

In [49]:
def getStates(warehouseFields, fieldStatus):  
    warehouseStates = itertools.product(fieldStatus, repeat=len(warehouseFields))
    states = []
    for state in warehouseStates:
      states.append(list(state))
    statesWithOperations = []
    for operation in operationsWithItems:
      for state in states:
        newCompleteState = state.copy()
        newCompleteState = [operation] + newCompleteState
        statesWithOperations.append(newCompleteState)
    return statesWithOperations

iterStates = getStates(warehouseFields, warehouseState)
states = []
for state in iterStates:
  states.append(list(state))

print(len(states))
print(states)

, [('RESTORE', 'BLUE'), 'RED', 'BLUE', 'BLUE', 'RED'], [('RESTORE', 'BLUE'), 'RED', 'BLUE', 'BLUE', 'EMPTY'], [('RESTORE', 'BLUE'), 'RED', 'BLUE', 'RED', 'WHITE'], [('RESTORE', 'BLUE'), 'RED', 'BLUE', 'RED', 'BLUE'], [('RESTORE', 'BLUE'), 'RED', 'BLUE', 'RED', 'RED'], [('RESTORE', 'BLUE'), 'RED', 'BLUE', 'RED', 'EMPTY'], [('RESTORE', 'BLUE'), 'RED', 'BLUE', 'EMPTY', 'WHITE'], [('RESTORE', 'BLUE'), 'RED', 'BLUE', 'EMPTY', 'BLUE'], [('RESTORE', 'BLUE'), 'RED', 'BLUE', 'EMPTY', 'RED'], [('RESTORE', 'BLUE'), 'RED', 'BLUE', 'EMPTY', 'EMPTY'], [('RESTORE', 'BLUE'), 'RED', 'RED', 'WHITE', 'WHITE'], [('RESTORE', 'BLUE'), 'RED', 'RED', 'WHITE', 'BLUE'], [('RESTORE', 'BLUE'), 'RED', 'RED', 'WHITE', 'RED'], [('RESTORE', 'BLUE'), 'RED', 'RED', 'WHITE', 'EMPTY'], [('RESTORE', 'BLUE'), 'RED', 'RED', 'BLUE', 'WHITE'], [('RESTORE', 'BLUE'), 'RED', 'RED', 'BLUE', 'BLUE'], [('RESTORE', 'BLUE'), 'RED', 'RED', 'BLUE', 'RED'], [('RESTORE', 'BLUE'), 'RED', 'RED', 'BLUE', 'EMPTY'], [('RESTORE', 'BLUE'), 'RED

Create a reward which fits our problem. <br>
The reward is higher if the distance our agent has to cover is lower.

In [5]:
# read statistics from file to calculate better rewards
# i = 0: White, i = 1: Blue, i = 2: Red
countItems = np.zeros((3))
itemsTotal = 0

warehouseorder = open('Exercise4_warehousetraining2x2.txt')
for line in warehouseorder:
    curAction = line.split('\t')
    # curOperation = curAction[0].upper()
    curItem = curAction[1].strip('\n').upper()
    if curItem == 'WHITE':
        countItems[0] += 1
    if curItem == 'BLUE':
        countItems[1] += 1
    if curItem == 'RED':
        countItems[2] += 1
    itemsTotal += 1

# i = 0: White, i = 1: Blue, i = 2: Red
probsItems = countItems / itemsTotal
print(probsItems)

# reward function based on distance and item probability:
def getRewardBasedOnDistanceAndProbability(fieldIndex, item):
    x = warehouseFields[fieldIndex]
    distance = x[0] + x[1] + 1
    prob = 0
    if item == 'WHITE':
        prob = probsItems[0]
    if item == 'BLUE':
        prob = probsItems[1]
    if item == 'RED':
        prob = probsItems[2]
    reward = 1/distance * 1/distance * prob
    return reward


[0.25180384 0.24336554 0.50483062]


Bring it all together now and create the transition and the reward matrix. <br>

In [67]:
# create Transition and reward matrix
def createTransitionAndRewardMatrix(actions, states):
    T = np.zeros((len(actions), len(states), len(states)))
    R = np.zeros((len(states), len(actions)))

    for i in range(len(actions)):
        # current action which is the field we take
        action = actions[i]
        for j in range(len(states)):
            curState = states[j]  
            
            operation = curState[0]
            curWarehouseState = curState[1:].copy()
            # Do this for each operation
            if operation[0] == 'STORE':
                if curWarehouseState[i] != 'EMPTY':
                    for k in range(0, 6):
                        nextOperation = states[(j + 256*k) % len(states)][0]
                        T[i, j, (j + k * 256) % len(states)] =  getTransitionProbabiltiy(nextOperation)
                else:

                    nextWarehouseState = curWarehouseState
                    nextWarehouseState[i] = operation[1]
                    nextIndexStart = states.index([operation] + nextWarehouseState)
                    transitionProbability = 1 / 6
                    for k in range(0, 6):
                        nextOperation = states[(nextIndexStart + 256*k) % len(states)][0]
                        T[i, j, (nextIndexStart + 256*k) % len(states)] =  getTransitionProbabiltiy(nextOperation)
                        rewardTest = [3,2,2,1]
                        R[(nextIndexStart + 256*k) % len(states), i] = rewardTest[i]

            else:
                if operation[1] != curWarehouseState[i]:
                    for k in range(0, 6):
                        nextOperation = states[(j + 256*k) % len(states)][0]
                        T[i, j, (j+256*k) % len(states)] = getTransitionProbabiltiy(nextOperation)
                else:
                    nextWarehouseState = curWarehouseState
                    nextWarehouseState[i] = 'EMPTY'
                    nextIndexStart = states.index([operation] + nextWarehouseState)
                    transitionProbability = 1 / 6
                    for k in range(0, 6):
                        nextOperation = states[(nextIndexStart + 256*k) % len(states)][0]
                        T[i, j, (nextIndexStart + 256*k) % len(states)] = getTransitionProbabiltiy(nextOperation)
                        rewardTest = [3,2,2,1]
                        R[(nextIndexStart + 256*k) % len(states), i] = rewardTest[i]
    return T, R

def getTransitionProbabiltiy(operation):
    probability = 0
    if operation == ('STORE', 'WHITE'):
        probability = 0.1
    elif operation == ('STORE', 'BLUE'):
        probability = 0.1
    elif operation == ('STORE', 'RED'):
        probability = 0.3
    elif operation == ('RESTORE', 'WHITE'):
        probability = 0.1
    elif operation == ('RESTORE', 'BLUE'):
        probability = 0.1
    else:
        probability = 0.3
    return probability


T, R = createTransitionAndRewardMatrix(actions, states)
print(np.shape(T))
test = np.sum(T[0][0])
print(T[0][0])
print(test)

(4, 1536, 1536)
[0.1 0.  0.  ... 0.  0.  0. ]
1.0


Finally create the mdp models and evaluate the different classes

In [68]:
# 1. Policy Iteration
mdpWarehousePolicy = mdptoolbox.mdp.PolicyIteration(T, R, 0.1, max_iter=100)
# Run the MDP
mdpWarehousePolicy.run()

print('PolicyIteration:')
print(mdpWarehousePolicy.policy)
#print(mdpWarehousePolicy.V)
print(mdpWarehousePolicy.iter)

PolicyIteration:
(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

In [69]:
# 2. QLearning
mdpWarehouseQ = mdptoolbox.mdp.QLearning(T, R, 0.1)
# Run the MDP
mdpWarehouseQ.run()

print('Q learning:')
print(mdpWarehouseQ.policy[0])
print(mdpWarehouseQ.V[0])

Q learning:
3
-1.307742003341208


In [71]:
# 3. ValueIteration
mdpWarehouseValueIter = mdptoolbox.mdp.ValueIteration(T, R, 0.1)
# Run the MDP
mdpWarehouseValueIter.run()

print('Value Iteration:')
print(mdpWarehouseValueIter.policy)
print(mdpWarehouseValueIter.V)
print(mdpWarehouseValueIter.iter)

OverflowError: cannot convert float infinity to integer

Now we evaluate the models and compare the needed steps for each model and also compare them to a greedy approach.

In [70]:
stepsTaken = [1, 2, 2, 3]

# get all test actions form the test file in a list
def getTestActions(warehouseorder):
    actionList = []
    for line in warehouseorder:
        split = line.split('\t')
        curOperation = split[0].upper()
        curItem = split[1].strip('\n').upper()
        curAction = (curOperation, curItem)
        actionList.append(curAction)
    return actionList

# evaluate a greedy approach, always store/restore at the nearest possible field
def greedyStorage(actionList):
    curState = ['EMPTY', 'EMPTY', 'EMPTY', 'EMPTY']
    steps = 0

    for (operation, item) in actionList:
        # case operation not possible
        if ((operation == 'STORE') and ('EMPTY' not in curState)) \
            or ((operation == 'RESTORE') and (item not in curState)):
            print("does this happen?") # => with our test data this is never the case
        else:
            if operation == 'STORE':
                # store in the nearest field
                for i in range(0, len(curState)):
                    if curState[i] == 'EMPTY':
                        curState[i] = item
                        steps += stepsTaken[i]
                        break
            # restore operation
            else:
                for i in range(0, len(curState)):
                    if curState[i] == item:
                        curState[i] = 'EMPTY'
                        steps += stepsTaken[i]
                        break
    return steps    

print(actions)
warehouseorder = open('Exercise4_warehousetraining2x2.txt')
actionList = getTestActions(warehouseorder)
greedySteps = greedyStorage(actionList)
print(greedySteps)

[(0, 0), (0, 1), (1, 0), (1, 1)]
14401


In [103]:
# evaluate how much steps our mdp models will take:
def evaluateMDPModel(mdpModel, actions, actionList):
    steps = 0
    policy = mdpModel.policy
    V = mdpModel.V

    curState = ['EMPTY', 'EMPTY', 'EMPTY', 'EMPTY']
    for (operation, item) in actionList:
        # case operation not possible
        if ((operation == 'STORE') and ('EMPTY' not in curState)) \
            or ((operation == 'RESTORE') and (item not in curState)):
            print("does this happen?") # => with our test data this is never the case
        else:
            # 1. get next state based on value of the field?
            curActionIndex = actions.index((operation, item))
            indexCurState = states.index(curState)
            curTransitionLine = T[curActionIndex, indexCurState]
            curPossibleSteps = [i for i, value in enumerate(curTransitionLine) if value != 0]
            valuesOfTheField = []
            for posInd in curPossibleSteps:
                valuesOfTheField.append(V[posInd])
            max_value = max(valuesOfTheField)
            max_index = curPossibleSteps[valuesOfTheField.index(max_value)]
            # get next state
            nextState = states[max_index].copy()
            # evaluate steps, therefore get index of change:
            for index, (first, second) in enumerate(zip(curState, nextState)):
                if first != second:
                    steps += stepsTaken[index]
            # set cur state to next state:
            curState = nextState           
    return steps    

stepsTaken = [1, 2, 2, 3]

print("Greedy approach steps:" + str(greedySteps))

policyIterationSteps = evaluateMDPModel(mdpWarehousePolicy, actions, actionList)
print("Policy iteration steps:" + str(policyIterationSteps))

valueIterationSteps = evaluateMDPModel(mdpWarehouseValueIter, actions, actionList)
print("Value iteration steps:" + str(valueIterationSteps))

QlearningSteps = evaluateMDPModel(mdpWarehouseQ, actions, actionList)
print("Q learning steps:" + str(QlearningSteps))

Greedy approach steps:14401


ValueError: ('STORE', 'RED') is not in list