# SAKI Ex04
## import libs and load training/test files

In [45]:
import mdptoolbox
import pandas as pd
import itertools as it
import numpy as np
from scipy import sparse
import enum 

test_data = pd.read_csv('Exercise 4 - Reinforcement Learning Data - warehouseorder.txt', sep='\t', header=None)
test_data.columns = ["action","color"]

training_data = pd.read_csv('Exercise 4 - Reinforcement Learning Data - warehousetraining.txt', header=None)
training_data.columns = ["training"]

## Enum types

In [46]:
class Color(enum.Enum): 
    red = 1
    white = 2
    blue = 3

## Enviroment

In [47]:
def get_warehouse(num_moves, warehouse_description, num_fields):
    warehouse = []
    for move in range(num_moves):
        for warehouse_state in it.product(warehouse_description, repeat=num_fields):
            states = []
            state_string = []
            for i in range(num_fields):
                state_string.append('state'+str(i))
                states.append(warehouse_state[i])

            state_string.append('NextMove')
            states.append(move)
            warehouse.append(states)

    return pd.DataFrame(warehouse, columns=state_string)

## Transition Probability Matrix 

In [48]:
def get_probs_matrix(num_actions, num_moves, warehouse_desc, num_fields, block_size):
    move_probs = (training_data.training.value_counts() / training_data.shape[0])
    move_probs = move_probs.round(3)
    
    TPM = []       
    for action in range(num_actions):
        TPM.append(np.zeros((num_state, num_state),dtype=np.float16))
        cur_i = 0
        for instr in range(num_moves):
            for warehouse_state in it.product(warehouse_desc, repeat=num_fields):
                for move in range(num_moves):
                    state_index = (cur_i % block_size) + (block_size * move)
                    tmpa = (cur_i % block_size) + (block_size * move)
                    tmpb = num_color**(num_actions - action - 1)
                    
                    if (instr in range(3)):
                        if(warehouse_state[action] != 0):
                            TPM[action][cur_i][state_index] = move_probs[move]
                        else:
                            if(instr == 0):
                                TPM[action][cur_i][tmpa + tmpb] = move_probs[move]
                            elif(instr == 1):
                                TPM[action][cur_i][tmpa + tmpb * 2] = move_probs[move]
                            elif(instr == 2):
                                TPM[action][cur_i][tmpa + tmpb * 3] = move_probs[move]
                    else:
                        if(warehouse_state[action] == 0):
                            TPM[action][cur_i][state_index] = move_probs[move]
                        else:
                            if(instr == 5):
                                TPM[action][cur_i][tmpa - tmpb * 3] = move_probs[move] 
                            elif(instr == 4):
                                TPM[action][cur_i][tmpa - tmpb * 2] = move_probs[move]
                            elif(instr == 3):
                                TPM[action][cur_i][tmpa - tmpb] = move_probs[move]
                           
                cur_i += 1
        TPM[action] = sparse.csr_matrix(TPM[action])
    return TPM

## Reward

In [49]:
def get_adder(action):
    if  (action == 0): adder = 1000;
    elif(action == 1): adder = 600;
    elif(action == 2): adder = 300;
    elif(action == 3): adder = 100;
    elif(action == 4): adder = 50;
    elif(action == 5): adder = 25;
    
    return action

In [50]:
def get_reward(num_actions, num_state, wh):
    reward = []
    for action in range(num_actions):
        reward.append(np.zeros((num_state, )))
        for index, warehouse in wh.iterrows():
            try:
                if(warehouse.NextMove in range(3) and warehouse[action] == 0):
                    adder = get_adder(action)
                elif warehouse.NextMove in range(3, 6) and (warehouse[action] == (warehouse.NextMove - 2)):
                    adder = get_adder(action) + 50
                else:
                    if warehouse.NextMove in range(3):
                         adder = 20
                    else:
                        adder = -10
                reward[-1][index] = adder
            except:
                print("An exception occurred") 
                print(warehouse.NextMove)

    reward = np.asarray(reward)
    reward = reward.transpose()
    return reward

## global functions

In [51]:
def getInstrNumber(inpt):
    if(inpt == 'store red'):
        value = 0
    elif(inpt == 'store white'):
        value = 1
    elif(inpt == 'store blue'):
        value = 2
    elif(inpt == 'restore red'):
        value = 3
    elif(inpt == 'restore white'):
        value = 4
    elif(inpt == 'restore blue'):
        value = 5  
    return value
    
def getColorNumber(color):
    if(color == 'red'):
         value = 1
    elif(color == 'white'):
        value = 2
    elif(color == 'blue'):
        value = 3 
    return value

def getWarehouseIndex(ground, instr, warehouse):
    return warehouse[(warehouse['state0'] == ground[0]) &  (warehouse['state1'] == ground[1]) &
                    (warehouse['state2'] == ground[2]) & (warehouse['state3'] == ground[3]) &
                    (warehouse['state4'] == ground[4]) &  (warehouse['state5'] == ground[5]) &
                    (warehouse['NextMove'] == getInstrNumber(instr))].index[0]

def getLength(index):
    length = 0
    if(index == 0):
        length = 2
    elif(index == 1):
        length = 4
    elif(index == 2):
        length = 4
    elif(index == 3):
        length = 6
    elif(index == 4):
        length = 6
    elif(index == 5):
        length = 8

    return length

## save steps in evaluation.txt file

In [52]:
outputeval = open('evaluate.txt', 'w') 
def store_restore_mlp_policy(policy, data, warehouse_policyiter):
    print(str(counter) + " PolicyIteration: \t" + str(warehouse_policyiter) + (data.action + ' ' + data.color), file = outputeval)
    #print("PolicyIteration")
    index = getWarehouseIndex(warehouse_policyiter, (data.action + ' ' + data.color), warehouse)
    if(data.action == 'store'):
        warehouse_policyiter[policy.policy[index]] = getColorNumber(data.color)
    elif(data.action == 'restore'):
        warehouse_policyiter[policy.policy[index]] = 0
        
    return getLength(policy.policy[index])

def store_restore_mlp_value(policy, data, warehouse_valueiter):
    #print("ValueIteration")
    print(str(counter) + " ValueIteration: \t" + str(warehouse_valueiter) + (data.action + ' ' + data.color), file = outputeval)
    index = getWarehouseIndex(warehouse_valueiter, (data.action + ' ' + data.color), warehouse)
    if(data.action == 'store'):
        warehouse_valueiter[policy.policy[index]] = getColorNumber(data.color)
    elif(data.action == 'restore'):
        warehouse_valueiter[policy.policy[index]] = 0
        
    return getLength(policy.policy[index])


## caculate steps for each algorithms
### Test Data

In [53]:
#variables
num_fields = 6 
num_color = 4 
num_moves = 6
num_actions = num_fields
block_size = num_color ** num_fields

num_state = num_color ** num_fields * num_moves
warehouse_description=[0,1,2,3]

warehouse_policy_interation=np.zeros(num_fields)
warehouse_value_iteration=np.zeros(num_fields)

length_mlp_value = 0
length_mlp_policy = 0
counter = 0

# training
warehouse = get_warehouse(num_moves, warehouse_description, num_fields)
P = get_probs_matrix(num_actions, num_moves, warehouse_description, num_fields, block_size)
R = get_reward(num_actions, num_state,warehouse)
mdpresultPolicy = mdptoolbox.mdp.PolicyIteration(P, R, 0.9, max_iter=100)
mdpresultValue = mdptoolbox.mdp.ValueIteration(P, R, 0.9, max_iter=100)

mdpresultPolicy.run()
mdpresultValue.run()

# evaluation
for data in test_data.itertuples():
    length_mlp_value += store_restore_mlp_value(mdpresultValue, data, warehouse_value_iteration)
    length_mlp_policy += store_restore_mlp_policy(mdpresultPolicy, data, warehouse_policy_interation)
    print('', file = outputeval)
    counter += 1

print("MLP value iteration:" + str(length_mlp_value))
print("MLP policy iteration:" + str(length_mlp_policy))
outputeval.close()

MLP value iteration:338
MLP policy iteration:338
