This is the Smart Factory Exercise.

In [73]:
import mdptoolbox, mdptoolbox.example
import numpy as np
import itertools

Define all items, states and actions of our model. <br>
* We have three different items (WHITE, BLUE, RED)
* We have four different possible states of each warehouse field (EMPTY, WHITE, BLUE, RED)
* We have two operations (STORE, RESTORE) which we can combine with each item
* Our robot can take n x n actions (It can chose one of the warehousefields)

In [74]:
def createWarehouseFields(length, height):
    warehouseFields = []
    for i in range(0, length):
        for j in range(0, height):
            warehouseFields.append((i,j))
    return warehouseFields

# warehouse size n x n (in our case n = 2)
n = 2
warehouseFields = createWarehouseFields(n, n)
actions = warehouseFields.copy()

items = ['WHITE', 'BLUE', 'RED']
operations = ['STORE', 'RESTORE']
warehouseState = ['WHITE', 'BLUE', 'RED', 'EMPTY']
operationsWithItems = []
for operation in operations:
    for item in items:
        operationsWithItems.append((operation, item))

print(actions)
print(operationsWithItems)

[(0, 0), (0, 1), (1, 0), (1, 1)]
[('STORE', 'WHITE'), ('STORE', 'BLUE'), ('STORE', 'RED'), ('RESTORE', 'WHITE'), ('RESTORE', 'BLUE'), ('RESTORE', 'RED')]


Next create all possible states of our warehouse, which are our operations with items (2x3) * each of the warehouse sates (nxn) ** Warehousestates.
This gives us 2*3*4^4= 1536 states-   

In [75]:
def getStates(warehouseFields, fieldStatus, operationsWithItems):  
    warehouseStates = itertools.product(fieldStatus, repeat=len(warehouseFields))
    states = []
    for state in warehouseStates:
      states.append(list(state))
    statesWithOperations = []
    for operation in operationsWithItems:
      for state in states:
        newCompleteState = state.copy()
        newCompleteState = [operation] + newCompleteState
        statesWithOperations.append(newCompleteState)
    return statesWithOperations

iterStates = getStates(warehouseFields, warehouseState, operationsWithItems)
# convert states to list
states = []
for state in iterStates:
  states.append(list(state))

print("Total amount of states:" + str(len(states)))

Total amount of states:1536


Read the information from the training file to get solid probabilites for each state transition. <br>
Create an reward function that fits our problem.

In [76]:
# read statistics from file to get a solid transition function
# i = 0: ('STORE', 'WHITE'), i = 1: ('STORE', 'BLUE'), i = 2: ('STORE', 'RED')
# i = 3: ('RESTORE', 'WHITE'), i = 4: ('RESTORE', 'BLUE'), i = 5: ('RESTORE', 'RED')
countItems = np.zeros((6))
itemsTotal = 0

warehouseorder = open('Exercise4_warehousetraining2x2.txt')
for line in warehouseorder:
    curAction = line.split('\t')
    curOperation = curAction[0].upper()
    curItem = curAction[1].strip('\n').upper()
    curOperationWithItem = (curOperation, curItem)
    if curOperationWithItem == ('STORE', 'WHITE'):
        countItems[0] += 1
    elif curOperationWithItem == ('STORE', 'BLUE'):
        countItems[1] += 1
    elif curOperationWithItem == ('STORE', 'RED'):
        countItems[2] += 1
    elif curOperationWithItem == ('RESTORE', 'WHITE'):
        countItems[3] += 1
    elif curOperationWithItem == ('RESTORE', 'BLUE'):
        countItems[4] += 1
    else:
        countItems[5] += 1
    itemsTotal += 1

operationsWithItemsProbabilites = countItems / itemsTotal
print(operationsWithItemsProbabilites)


# get probabilites based on the operation and operation x item probabilites
def getTransitionProbabiltiy(operation):
    if operation == ('STORE', 'WHITE'):
        return operationsWithItemsProbabilites[0]
    elif operation == ('STORE', 'BLUE'):
        return operationsWithItemsProbabilites[1]
    elif operation == ('STORE', 'RED'):
        return operationsWithItemsProbabilites[2]
    elif operation == ('RESTORE', 'WHITE'):
        return operationsWithItemsProbabilites[3]
    elif operation == ('RESTORE', 'BLUE'):
        return operationsWithItemsProbabilites[4]
    else:
        return operationsWithItemsProbabilites[5]

# function that calculates a simple reward for each field => better reward if the distance is low
def getSimpleReward(warehouseFields):
    reward = []
    for (x, y) in warehouseFields:
        distance = (x+y+1)
        curReward = 1/distance * 1/distance
        reward.append(curReward)
    return reward

[0.12596307 0.12168277 0.25241531 0.12584077 0.12168277 0.25241531]


Bring it all together now and create the transition and the reward matrix. <br>

In [77]:
# create Transition and reward matrix
def createTransitionAndRewardMatrix(actions, states, numberOperations, rewardFunction):
    numberActions = len(actions)
    numberStates = len(states)
    # the warehouse state repeats in an iterval for each operation
    operationInterval = len(states) / numberOperations
    T = np.zeros((numberActions, numberStates, numberStates))
    R = np.zeros((numberActions, numberStates, numberStates))

    for i in range(numberActions):
        # current action which is the field we take
        action = actions[i]
        for j in range(len(states)):
            curState = states[j]           
            operation = curState[0]
            curWarehouseState = curState[1:].copy()
            # check if we are in an invalid scenario => no reward and do not change warehouse state
            if (curWarehouseState[i] != 'EMPTY' and operation[0] == 'STORE') \
                or (curWarehouseState[i] != operation[1] and operation[0] == 'RESTORE'):
                for k in range(numberOperations):
                    nextIndex = int((j + k * operationInterval) % len(states))
                    nextOperation = states[nextIndex][0]
                    T[i, j, nextIndex] =  getTransitionProbabiltiy(nextOperation)
            # valid operation set reward and change warehouse state
            else:
                nextWarehouseState = curWarehouseState
                if operation == 'STORE':
                    nextWarehouseState[i] = operation[1]
                else:
                    nextWarehouseState[i] = 'EMPTY'
                nextState = [operation] + nextWarehouseState
                nextIndexStart = states.index(nextState)
                for k in range(numberOperations):
                    nextIndex = int((nextIndexStart + operationInterval*k) % len(states))
                    nextOperation = states[nextIndex][0]
                    T[i, j, nextIndex] = getTransitionProbabiltiy(nextOperation)
                    R[i, j, nextIndex] = rewardFunction[i]   
    return T, R

rewardFunction = getSimpleReward(warehouseFields)
T, R = createTransitionAndRewardMatrix(actions, states, len(operationsWithItems), rewardFunction)
print(np.shape(T))
test = np.sum(T[0][0])
print(test)

(4, 1536, 1536)
1.0


Finally create the mdp models and evaluate the different classes

In [78]:
# 1. Policy Iteration
mdpWarehousePolicy = mdptoolbox.mdp.PolicyIteration(T, R, 0.1, max_iter=100)
# Run the MDP
mdpWarehousePolicy.run()

# just show the first 20 entries of the matrices
print('PolicyIteration:')
print(mdpWarehousePolicy.policy[0:20])
print(mdpWarehousePolicy.V[0:20])
print(mdpWarehousePolicy.iter)

PolicyIteration:
(0, 0, 0, 3, 0, 0, 0, 3, 0, 0, 0, 3, 2, 2, 2, 2, 0, 0, 0, 3)
(0.014603873152815833, 0.016166200060012882, 0.01779896409443149, 0.13180344388096357, 0.0181187495384474, 0.018138230885186263, 0.021309325509641642, 0.13523791840022625, 0.02179246861588927, 0.023350280552664904, 0.021874872158246024, 0.13882788131383167, 0.2783025481355866, 0.27978447317641475, 0.2813334810469968, 0.27830225679367804, 0.0181187495384474, 0.018138230885186263, 0.021309325509641642, 0.13523791840022625)
1


In [79]:
# 2. QLearning
mdpWarehouseQ = mdptoolbox.mdp.QLearning(T, R, 0.1)
# Run the MDP
mdpWarehouseQ.run()

print('Q learning:')
print(mdpWarehouseQ.policy[0:20]) # I think this chooses 0 way to often, might be an error in the model
print(mdpWarehouseQ.V[0:20])

Q learning:
(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)
(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 3.7074934631172406e-06, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)


In [80]:
# 3. ValueIteration
mdpWarehouseValueIter = mdptoolbox.mdp.ValueIteration(T, R, 0.1, max_iter=100)
# Run the MDP
mdpWarehouseValueIter.run()

print('Value Iteration:')
print(mdpWarehouseValueIter.policy[0:20])
print(mdpWarehouseValueIter.V[0:20])
print(mdpWarehouseValueIter.iter)

Value Iteration:
(0, 0, 0, 3, 0, 0, 0, 3, 0, 0, 0, 3, 2, 2, 2, 2, 0, 0, 0, 3)
(0.012584077289959646, 0.013936108053755114, 0.015388691859280096, 0.12925142336907042, 0.01562614650849945, 0.01562614650849945, 0.018430761077819904, 0.13229349258761022, 0.018894460070930662, 0.02024649083472613, 0.018894460070930662, 0.13556180615004143, 0.2750856059679589, 0.27643763673175437, 0.27789022053727935, 0.2750856059679589, 0.01562614650849945, 0.01562614650849945, 0.018430761077819904, 0.13229349258761022)
2


Now we evaluate the models and compare the needed steps for each model and also compare them to a greedy approach.

In [81]:
# get all test actions form the test file in a list
def getTestActions(warehouseorder):
    actionList = []
    for line in warehouseorder:
        split = line.split('\t')
        curOperation = split[0].upper()
        curItem = split[1].strip('\n').upper()
        curAction = (curOperation, curItem)
        actionList.append(curAction)
    return actionList

# evaluate a greedy approach, always store/restore at the nearest possible field
def greedyStorage(actionList, stepsTaken):
    curState = ['EMPTY', 'EMPTY', 'EMPTY', 'EMPTY']
    steps = 0

    for (operation, item) in actionList:
        # case operation not possible
        if ((operation == 'STORE') and ('EMPTY' not in curState)) \
            or ((operation == 'RESTORE') and (item not in curState)):
            print("does this happen?") # => with our test data this is never the case
        else:
            if operation == 'STORE':
                # store in the nearest field
                for i in range(0, len(curState)):
                    if curState[i] == 'EMPTY':
                        curState[i] = item
                        steps += stepsTaken[i]
                        break
            # restore operation
            else:
                for i in range(0, len(curState)):
                    if curState[i] == item:
                        curState[i] = 'EMPTY'
                        steps += stepsTaken[i]
                        break
    return steps    

stepsTaken = []
for (x, y) in warehouseFields:
    stepsTaken.append(x+y+1)

warehouseorder = open('Exercise4_warehousetraining2x2.txt')
actionList = getTestActions(warehouseorder)
greedySteps = greedyStorage(actionList, stepsTaken)
print(greedySteps)

14401


In [72]:
# evaluate how much steps our mdp models will take:
def evaluateMDPModel(mdpModel, actions, actionList):
    steps = 0
    policy = mdpModel.policy
    actionSteps = [1,2,2,3]
    curState = ['EMPTY', 'EMPTY', 'EMPTY', 'EMPTY']
    # iterate through actions, transition via policy
    for action in actionList:
        # get index of the current state
        curStateIndex = states.index([action] + curState)
        # select the action based on the policy
        actionField = policy[curStateIndex]
        steps += actionSteps[actionField]
        # change warehouse state for the next action
        (operation, item) = action
        if operation == 'STORE':
            # only happens with qlearning, policy and value iteration only take valid actions
            #if curState[actionField] != 'EMPTY':
            #     print("not allowed action store")
            curState[actionField] = item
        else:
            #if curState[actionField] != item:
            #    print("not allowed action restore")
            curState[actionField] = 'EMPTY'
    return steps    

print("Greedy approach steps:" + str(greedySteps))

policyIterationSteps = evaluateMDPModel(mdpWarehousePolicy, actions, actionList)
print("Policy iteration steps:" + str(policyIterationSteps))

valueIterationSteps = evaluateMDPModel(mdpWarehouseValueIter, actions, actionList)
print("Value iteration steps:" + str(valueIterationSteps))

# Qlearning cheats, it takes options which are not valid as it is configured right now
QlearningSteps = evaluateMDPModel(mdpWarehouseQ, actions, actionList)
print("Q learning steps:" + str(QlearningSteps))

Greedy approach steps:14401
Policy iteration steps:14401
Value iteration steps:14401
Q learning steps:9137
