In [1]:
import copy
import numpy as np
from itertools import combinations
from itertools import product
import mdptoolbox

### Training Data 2x2:

In [2]:
# check if file exists on given path
# NOTE(review): `filename` is rebound by the test-data cell further down, so the
# read cell that follows must run before that cell to load the training file.
import os
filename = "./warehousetraining2x2.txt"
print("Path exists? {}".format(os.path.exists(filename)))

Path exists? True


In [3]:
# read the tab-separated training file into a numpy array of strings
# (one row per request, as the printed array shows: ['store' | 'restore', colour])
import csv
# the context manager closes the handle even if parsing fails;
# newline='' is the mode the csv module documents for reader input
with open(filename, 'rt', newline='') as raw_data:
    reader = csv.reader(raw_data, delimiter='\t', quoting=csv.QUOTE_NONE)
    x = list(reader)
training_data_2_2 = np.array(x).astype("str")
print(training_data_2_2.shape)
print(training_data_2_2)

(8177, 2)
[['store' 'red']
 ['store' 'red']
 ['store' 'red']
 ...
 ['restore' 'red']
 ['restore' 'red']
 ['restore' 'white']]


### Test Data 2x2:

In [23]:
# check that the test-order file exists
# NOTE(review): this rebinds `filename`, which previously pointed at the training file.
import os
filename = "./warehouseordernew.txt"
print("Path exists? {}".format(os.path.exists(filename)))

Path exists? True


In [24]:
# read the tab-separated test file into a numpy array of strings
import csv
# the context manager closes the handle even if parsing fails;
# newline='' is the mode the csv module documents for reader input
with open(filename, 'rt', newline='') as raw_data:
    reader = csv.reader(raw_data, delimiter='\t', quoting=csv.QUOTE_NONE)
    x = list(reader)
test_data_2_2 = np.array(x).astype("str")
print(test_data_2_2.shape)

(20, 2)


### Define States, Actions, Transition probability Matrix and Reward Matrix

In [37]:
import copy
import numpy as np
from itertools import combinations
from itertools import product


class T_R_prepare:
    """Prepare states, actions and empirical request probabilities for the warehouse MDP.

    A full state is the vector ``[slot_1, ..., slot_n, request, item]``: every slot
    holds 0 (empty) or an item class code, ``request`` is 5 (store) or 8 (restore)
    and ``item`` is the class code of the requested item.
    """

    # item colour -> class code stored in a slot
    classes = {'red': 1, 'blue': 2, 'white': 3}
    # request keyword -> request code
    state_action = {'store': 5, 'restore': 8}
    # slot occupancies visited by the most recent data() call (set per instance)
    combinations = None
    # every (request, item) pair, in the order used by all state and probability tables
    actions = np.array([[5, 1], [5, 2], [5, 3], [8, 1], [8, 2], [8, 3]])

    # positions as [#x, #y], initial_training_state as [state_action, classes]
    def __init__(self, initial_training_state, positions, number_of_positions, number_of_classes, training_data,
                 test_data):
        """Store the warehouse layout and the raw request sequences.

        Args:
            initial_training_state: ``[request, item]`` used as the first request of
                every processed sequence.
            positions: ``[#x, #y]`` grid extent of the warehouse.
            number_of_positions: number of slots.
            number_of_classes: number of distinct item classes.
            training_data: rows of ``[request keyword, colour]``.
            test_data: rows of ``[request keyword, colour]``.
        """
        self.num_pos = number_of_positions
        self.pos = positions
        self.num_class = number_of_classes
        self.init_training = initial_training_state
        self.training_data = training_data
        self.test_data = test_data
        # every slot can be empty or hold one of the classes
        self.num_comb = np.power((self.num_class + 1), self.num_pos)
        # choose position where to store or restore the item
        self.Action = np.arange(self.num_pos)

    def Action_dict(self):
        """Map every action index to its ``[x, y]`` grid position (1-based, x varies fastest)."""
        x_count, y_count = self.pos
        position_elem = np.array([[x, y]
                                  for y in range(1, y_count + 1)
                                  for x in range(1, x_count + 1)])
        return dict(zip(self.Action, position_elem))

    # takes a state in array-format [slots..., 'store/restore', class] and performs the greedy action on it
    # returning the next slot occupancy and the chosen position - for example: [0, 0, 0, 0, 5, 1] --> [1, 0, 0, 0], 0
    def store_restore(self, array):
        """Apply the greedy action for the request encoded in ``array``.

        Storing uses the first empty slot; restoring frees the first slot that holds
        the requested class.

        Args:
            array: state vector ``[slot_1, ..., slot_n, request, item]``. It is not
                modified; the method works on a copy.

        Returns:
            ``(slots, pos)``: the slot occupancy after the action and the index of the
            slot used, or ``self.num_pos`` when the request cannot be served.
        """
        array = np.array(array)  # copy, so the caller's state is never mutated
        slots = array[:-2]
        request, item = array[-2], array[-1]

        if request == 5:
            candidates = np.flatnonzero(slots == 0)
            new_content = item
        else:
            candidates = np.flatnonzero(slots == item)
            new_content = 0

        if candidates.size:
            pos = candidates[0]
            slots[pos] = new_content
        else:
            # warehouse full (store) or item not present (restore)
            pos = self.num_pos

        return slots, pos

    # iterates through given data and performs store_restore method for every state in the data
    # output: array [[0,0,0,0,5,1],1]       array [[0,0,0,0]
    #                [1,0,0,0,5,2],2]              [1,0,0,0]
    #                [1,2,0,0,5,3],1] ...],        [1,2,0,0] ...]
    def data(self, data):
        """Replay a request sequence greedily, starting from an empty warehouse.

        The first request is always ``self.init_training``; the first row of ``data``
        is skipped and rows 1.. supply the following requests.

        Args:
            data: rows of ``[request keyword, colour]``.

        Returns:
            ``(states, combinations)``: an object array of shape ``(n, 2)`` holding
            ``[state vector, greedy position]`` per step, and an integer array with the
            slot occupancy of every step. ``self.combinations`` is updated as a side effect.
        """
        empty = np.zeros(self.num_pos)
        current = np.concatenate((empty, self.init_training))

        states = [current.astype(int)]
        combinations = [empty.astype(int)]

        # data[1:] rather than data[-(len(data) - 1):]: the latter degenerates to the
        # whole sequence when data has exactly one row
        for el in data[1:]:
            next_combi, action = self.store_restore(current)
            combinations.append(np.array(next_combi).astype(int))

            next_request = [self.state_action[el[0]], self.classes[el[1]]]
            current = np.concatenate((next_combi, next_request))

            states.append(np.array(action).astype(int))
            states.append(current.astype(int))

        # greedy position for the final state (self.num_pos if it cannot be served)
        _, last_pos = self.store_restore(states[-1])
        states.append(np.array(last_pos).astype(int))

        # the list alternates state vectors and scalar positions, i.e. it is ragged;
        # dtype=object is required, otherwise recent NumPy versions raise a ValueError
        table = np.array(states, dtype=object)
        table = table.reshape((len(states) // 2, 2))

        combinations = np.array(combinations)
        self.combinations = copy.copy(combinations)

        return table, combinations

    def get_training_data(self):
        """Run ``data`` on the training sequence."""
        return self.data(self.training_data)

    def get_test_data(self):
        """Run ``data`` on the test sequence."""
        return self.data(self.test_data)

    def states(self):
        """Enumerate every full state: each slot occupancy combined with each request."""
        n_requests = len(self.actions)
        occupancies = np.array(list(product(range(self.num_class + 1), repeat=self.num_pos)))
        occupancies = np.repeat(occupancies, n_requests, axis=0)
        actions_tile = np.tile(self.actions, (self.num_comb, 1))
        return np.concatenate((occupancies, actions_tile), axis=1)

    @staticmethod
    def _request_codes(training_states):
        """Return the ``(request, item)`` tuple of every row produced by ``data``."""
        return [tuple(int(v) for v in row[0][-2:]) for row in training_states]

    # computes apriori probabilities of the store/restore actions based on the given training data
    def apriori_probs(self):
        """Relative frequency of every ``(request, item)`` pair in the training sequence.

        Returns:
            dict mapping ``(request, item)`` to its relative frequency.
        """
        training_states, _ = self.get_training_data()
        total = len(training_states)

        # single counting pass instead of one scan of the data per request type
        counts = {}
        for code in self._request_codes(training_states):
            counts[code] = counts.get(code, 0) + 1

        apriori = self.actions
        apriori_probs = np.array([counts.get(tuple(int(v) for v in el), 0) / total
                                  for el in apriori])

        # floating point sums are only approximately 1
        assert np.isclose(np.sum(apriori_probs), 1)

        return dict(zip(self.totuple(apriori), self.totuple(apriori_probs)))

    # computes the conditional probabilities of the store/restore actions based on the given training data and apriori probabilities
    def conditional_probs(self):
        """Probability of the next request given the current one, from the training sequence.

        Returns:
            dict mapping ``((request, item), (next_request, next_item))`` to the
            row-normalized transition frequency.
        """
        n_requests = len(self.actions)
        training_prob = np.concatenate((np.repeat(self.actions, n_requests, axis=0),
                                        np.tile(self.actions, (n_requests, 1))),
                                       axis=1).reshape((n_requests * n_requests, 2, 2))
        training_states, _ = self.get_training_data()
        apriori_dict = self.apriori_probs()

        # count consecutive request pairs in one pass
        codes = self._request_codes(training_states)
        pair_counts = {}
        for pair in zip(codes, codes[1:]):
            pair_counts[pair] = pair_counts.get(pair, 0) + 1

        n_transitions = len(training_states) - 1
        probs = []
        for el in training_prob:
            pair = (tuple(int(v) for v in el[0]), tuple(int(v) for v in el[1]))
            p = pair_counts.get(pair, 0)
            p = p / np.multiply(n_transitions, apriori_dict[self.totuple(el[0])])
            probs.append(p)

        probs_check = np.array(probs).reshape(n_requests, n_requests)

        # normalize every row so it is a probability distribution
        for i in range(n_requests):
            probs_check[i] = probs_check[i] / np.sum(probs_check[i])

        probs = np.squeeze(probs_check.reshape(n_requests * n_requests, 1))

        # floating point sums are only approximately 1
        assert np.allclose(np.sum(probs_check, axis=1), 1)

        return dict(zip(self.totuple(training_prob), self.totuple(probs)))

    # from https://stackoverflow.com/questions/10016352/convert-numpy-array-to-tuple/10016379 - totuple function
    def totuple(self, a):
        """Recursively convert a (nested) iterable such as an ndarray into nested tuples."""
        try:
            return tuple(self.totuple(i) for i in a)
        except TypeError:
            # not iterable: a is a scalar leaf
            return a



In [38]:
# build the preparation object for the 2x2 warehouse (4 slots, 3 item classes);
# the first request of every replayed sequence is "store red" -> [5, 1]
states_actions_2_2 = T_R_prepare(
    initial_training_state=np.array([5, 1]),
    positions=[2, 2],
    number_of_positions=4,
    number_of_classes=3,
    training_data=training_data_2_2,
    test_data=test_data_2_2,
)
num_pos = states_actions_2_2.num_pos
print(num_pos)

4


#### States:

In [8]:
# get all possible states
# each row is [slot_1, ..., slot_4, request, item]; the row index is the state index
# used for the T and R matrices built later in this notebook
states = states_actions_2_2.states()
print(states)

[[0 0 0 0 5 1]
 [0 0 0 0 5 2]
 [0 0 0 0 5 3]
 ...
 [3 3 3 3 8 1]
 [3 3 3 3 8 2]
 [3 3 3 3 8 3]]


#### Actions:

In [9]:
# define Actions
# Action_dict maps an action index to its [x, y] grid position,
# Action is the array of action indices (one per slot)
Action_dict = states_actions_2_2.Action_dict()
Action = states_actions_2_2.Action
print(Action_dict)
print(Action)

{0: array([1, 1]), 1: array([2, 1]), 2: array([1, 2]), 3: array([2, 2])}
[0 1 2 3]


#### Training Data and Probabilities:

In [10]:
# get the training states - greedy
# each row of training_states is [state vector, greedy slot index]
# NOTE(review): this rebinds the global name `combinations`, which the import cell
# bound to itertools.combinations.
training_states, combinations = states_actions_2_2.get_training_data()
print(training_states)

[[array([0, 0, 0, 0, 5, 1]) array(0)]
 [array([1, 0, 0, 0, 5, 1]) array(1)]
 [array([1, 1, 0, 0, 5, 1]) array(2)]
 ...
 [array([1, 3, 1, 3, 8, 1]) array(0)]
 [array([0, 3, 1, 3, 8, 1]) array(2)]
 [array([0, 3, 0, 3, 8, 3]) array(1)]]


In [11]:
# get the apriori probabilities
# dict: (request, item) -> relative frequency in the training sequence
apriori_probs = states_actions_2_2.apriori_probs()
print(apriori_probs)

{(5, 1): 0.25241531123884065, (5, 2): 0.12168276874159227, (5, 3): 0.12596306713953773, (8, 1): 0.25241531123884065, (8, 2): 0.12168276874159227, (8, 3): 0.12584077289959644}


In [12]:
# get the conditional probabilities
# dict: ((request, item), (next request, next item)) -> row-normalized frequency;
# this global is read by P() and by the cells that build T
conditional_probs = states_actions_2_2.conditional_probs()
print(conditional_probs)

{((5, 1), (5, 1)): 0.2877906976744186, ((5, 1), (5, 2)): 0.14098837209302326, ((5, 1), (5, 3)): 0.14292635658914732, ((5, 1), (8, 1)): 0.30765503875968997, ((5, 1), (8, 2)): 0.05765503875968993, ((5, 1), (8, 3)): 0.06298449612403101, ((5, 2), (5, 1)): 0.2693467336683417, ((5, 2), (5, 2)): 0.12462311557788944, ((5, 2), (5, 3)): 0.1577889447236181, ((5, 2), (8, 1)): 0.18592964824120606, ((5, 2), (8, 2)): 0.1949748743718593, ((5, 2), (8, 3)): 0.06733668341708543, ((5, 3), (5, 1)): 0.2883495145631068, ((5, 3), (5, 2)): 0.13592233009708737, ((5, 3), (5, 3)): 0.14466019417475728, ((5, 3), (8, 1)): 0.18155339805825244, ((5, 3), (8, 2)): 0.07378640776699029, ((5, 3), (8, 3)): 0.17572815533980582, ((8, 1), (5, 1)): 0.21124031007751942, ((8, 1), (5, 2)): 0.10077519379844962, ((8, 1), (5, 3)): 0.08720930232558141, ((8, 1), (8, 1)): 0.2853682170542636, ((8, 1), (8, 2)): 0.15164728682170542, ((8, 1), (8, 3)): 0.16375968992248063, ((8, 2), (5, 1)): 0.24623115577889446, ((8, 2), (5, 2)): 0.1065326633

### 1. Definition of T and R

#### Transition-Probability-Matrix:

In [16]:
# number of slot occupancies: (number of classes + 1) ** number of slots
num_comb = states_actions_2_2.num_comb
print(num_comb)

256


In [20]:
# computes the transition probability from one state to another state after performing a specific action
def P(states_i, states_j, action_k):
    """Transition probability from ``states_i`` to ``states_j`` when slot ``action_k`` is chosen.

    Both states are vectors ``[slot_1, ..., slot_n, request, item]``. The move is
    feasible when ``states_j`` has the slot occupancy that results from applying the
    request of ``states_i`` to slot ``action_k`` (or the unchanged occupancy when the
    request cannot be applied to that slot). A feasible move gets the conditional
    probability of the request change, read from the global ``conditional_probs``;
    every other move gets 0.
    """
    request_i = (states_i[-2], states_i[-1])
    request_j = (states_j[-2], states_j[-1])
    cond_prob_ij = conditional_probs[(request_i, request_j)]

    slot_i = states_i[action_k]
    slot_j = states_j[action_k]
    item = states_i[-1]

    # all slots except action_k are identical in both states
    others_equal = ((states_i[:action_k] == states_j[:action_k]).all() and
                    (states_i[(action_k + 1):-2] == states_j[(action_k + 1):-2]).all())
    # the complete slot occupancy is identical in both states
    unchanged = (states_i[:-2] == states_j[:-2]).all()

    if states_i[-2] == 5:
        # store: item lands in an empty slot, or the slot is occupied and nothing changes
        stored = slot_i == 0 and item == slot_j and others_equal
        blocked = slot_i != 0 and unchanged
        feasible = stored or blocked
    else:
        # restore: matching item leaves the slot, or nothing changes because the
        # slot is empty or holds a different item
        removed = slot_i != 0 and slot_j == 0 and item == slot_i and others_equal
        empty_slot = slot_i == 0 and unchanged
        wrong_item = slot_i != 0 and unchanged and item != slot_i
        feasible = removed or empty_slot or wrong_item

    return 1 * cond_prob_ij if feasible else 0

In [21]:
# define the skeleton of T
# slices for the four slot actions start at zero; the "fill T" cell overwrites them
T1, T2, T3, T4 = (np.zeros((num_comb*6, num_comb*6)) for _ in range(4))
# slices for the four "do nothing" actions
T1n, T2n, T3n, T4n = (np.zeros((num_comb*6, num_comb*6)) for _ in range(4))

# Transition Probability Matrix for action doing nothing:
# the slot occupancy stays the same and only the request changes
for i in range(num_comb*6):
    for j in range(num_comb*6):

        cond_prob_ij = conditional_probs[(tuple(states[i][-2:]), tuple(states[j][-2:]))]

        if np.array_equal(states[i][:-2], states[j][:-2]):
            T1n[i][j] = T2n[i][j] = T3n[i][j] = T4n[i][j] = cond_prob_ij

# stack to shape (number of actions, number of states, number of states)
T = np.stack([T1, T2, T3, T4, T1n, T2n, T3n, T4n])
print(T.shape)

(8, 1536, 1536)


In [22]:
# fill T
# only the first len(Action) slices (the slot actions) are written here;
# the remaining slices keep the "do nothing" probabilities set in the skeleton cell
for k in range(len(Action)):
    for i in range(num_comb*6):
        for j in range(num_comb*6):
            T[k][i][j] = P(states[i], states[j], Action[k])



In [1448]:
# cache the transition tensor on disk; the optimization section reloads it from this file
np.save('T2x2.npy', T)

#### Reward:

In [25]:
def r(state_i, action_k):
    """Reward pair for choosing slot ``action_k`` in ``state_i``.

    Returns ``(R_i, R_n)``: the reward for acting on the slot and the reward for the
    matching "do nothing" action. Reads the globals ``Action_dict`` and ``num_pos``.
    A successful slot action is rewarded with ``1/actionk*10``, where ``actionk`` is
    the coordinate sum of the slot, so slots with smaller coordinates pay more.

    NOTE(review): another function named ``r`` (single return value) is defined
    later in this notebook and shadows this one once its cell has run.
    """
    actionk = np.sum(Action_dict[action_k])
    slot = state_i[action_k]

    if state_i[-2] == 5:
        # store request
        success = slot == 0
        # once `success` is False the slot is non-empty, so `failure` is always True
        # here and the final fallback cannot be reached for store requests
        failure = (state_i[:num_pos] != 0).all() or (slot != 0)
    else:
        # restore request
        success = slot != 0 and state_i[-1] == slot
        failure = (slot != 0 and state_i[-1] != slot) or (state_i[:num_pos] == 0).all()

    if success:
        return 1/actionk*10, -100
    if failure:
        return -100, 1
    # restore aimed at an empty slot while other slots are occupied
    return -1, -10

In [26]:
# reward matrix: one row per state,
# columns 0 .. len(Action)-1 hold the slot-action rewards,
# columns len(Action) .. 2*len(Action)-1 the rewards of the matching "do nothing" actions
R = np.zeros((num_comb*6, 2*len(Action)))

for i in range(num_comb*6):
    for k in range(len(Action)):
        R_ik, R_n = r(states[i], Action[k])
        R[i][k] = R_ik
        # offset by the number of slot actions instead of a hard-coded 4,
        # so the layout stays consistent with the shape chosen above
        R[i][k + len(Action)] = R_n

print(R)

[[   5.            3.33333333    3.33333333 ... -100.
  -100.         -100.        ]
 [   5.            3.33333333    3.33333333 ... -100.
  -100.         -100.        ]
 [   5.            3.33333333    3.33333333 ... -100.
  -100.         -100.        ]
 ...
 [-100.         -100.         -100.         ...    1.
     1.            1.        ]
 [-100.         -100.         -100.         ...    1.
     1.            1.        ]
 [   5.            3.33333333    3.33333333 ... -100.
  -100.         -100.        ]]


### 2. Definition of T and R

#### Transition Probability Matrix:

In [None]:
# define the skeleton of T
# slices for the four slot actions start at zero; the "fill T" cell overwrites them
T1, T2, T3, T4 = (np.zeros((num_comb*6, num_comb*6)) for _ in range(4))
# single slice for the "do nothing" action
T5 = np.zeros((num_comb*6, num_comb*6))

# Transition Probability Matrix for action doing nothing:
# the slot occupancy stays the same and only the request changes
for i in range(num_comb*6):
    for j in range(num_comb*6):

        cond_prob_ij = conditional_probs[(tuple(states[i][-2:]), tuple(states[j][-2:]))]

        if np.array_equal(states[i][:-2], states[j][:-2]):
            T5[i][j] = cond_prob_ij


# stack to shape (number of actions, number of states, number of states)
T = np.stack([T1, T2, T3, T4, T5])
print(T.shape)

In [None]:
# fill T
# only the first len(Action) slices (the slot actions) are written here;
# the last slice keeps the "do nothing" probabilities set in the skeleton cell
# NOTE(review): this cell has no execution count in the saved notebook.
for k in range(len(Action)):
    for i in range(num_comb*6):
        for j in range(num_comb*6):
            T[k][i][j] = P(states[i], states[j], Action[k])

#### Reward:

In [1571]:
def r(state_i, action_k):
    """Reward for taking ``action_k`` in ``state_i`` (single "do nothing" action variant).

    ``action_k < num_pos`` addresses a slot; any other value is the "do nothing"
    action. Reads the globals ``Action_dict`` and ``num_pos``.

    NOTE(review): this redefines the ``r`` from the first "Definition of T and R"
    section, which returns a pair instead of a single value.
    """
    slots = state_i[:num_pos]
    storing = state_i[-2] == 5

    if not action_k < num_pos:
        # "do nothing": cheaper (-30) only when the request cannot be served at all,
        # i.e. storing into a full warehouse or restoring from an empty one
        if storing:
            return -50 if (slots == 0).any() else -30
        return -50 if (slots != 0).any() else -30

    actionk = np.sum(Action_dict[action_k])
    slot = state_i[action_k]

    if storing:
        success = slot == 0
        # once `success` is False the slot is non-empty, so `failure` is always True
        # here and the final fallback cannot be reached for store requests
        failure = (slots != 0).all() or (slot != 0)
    else:
        success = slot != 0 and state_i[-1] == slot
        failure = (slot != 0 and state_i[-1] != slot) or (slots == 0).all()

    if success:
        # slots with a smaller coordinate sum pay more
        return 1/actionk*2
    if failure:
        return -100
    # restore aimed at an empty slot while other slots are occupied
    return -5

In [1572]:
# reward matrix for the 5-action variant: actions 0-3 address a slot,
# action 4 is "do nothing" (r() treats every index >= num_pos that way)
Action_R = np.arange(5)
R = np.zeros((num_comb*6, len(Action_R)))

for i in range(num_comb*6):
    for k in range(len(Action_R)):
        R_ik = r(states[i], Action_R[k])
        R[i][k] = R_ik
        
# spot check of a single state's reward row
print(R[1076])

[-100.         -100.            0.66666667 -100.          -50.        ]


In [1416]:
# number of slot occupancies (same assignment as in the first "Definition of T and R" section)
num_comb = states_actions_2_2.num_comb
print(num_comb)

256


### Optimization 2x2

In [1452]:
# reload the cached transition tensor written by the np.save('T2x2.npy', T) cell
T = np.load('T2x2.npy')

In [1530]:
# validate that T and R form a consistent MDP description
# NOTE(review): 'T2x2.npy' was saved from the 8-action T, while the most recent R cell
# above builds a 5-column R; presumably the 8-column R must be in scope here - confirm.
mdptoolbox.util.check(T, R)

#### Policy Iteration:

In [1950]:
# solve the MDP with policy iteration (discount 0.9999999, at most 15360 iterations)
pi = mdptoolbox.mdp.PolicyIteration(T, R, 0.9999999, max_iter = 15360)
pi.run()
policy_pi = pi.policy
# persist the resulting policy (one action index per state); saving the solver
# object `pi` itself would pickle the whole solver, including its copy of T
np.save('pi2x2.npy', policy_pi)

In [1951]:
# one action index per state, in the row order of `states`
print(policy_pi)

(0, 0, 2, 4, 4, 4, 0, 0, 2, 3, 7, 7, 0, 0, 2, 7, 3, 7, 0, 0, 2, 7, 7, 3, 0, 0, 1, 2, 6, 6, 0, 1, 1, 2, 6, 6, 0, 0, 1, 2, 3, 6, 0, 0, 1, 2, 6, 3, 0, 3, 1, 6, 2, 6, 0, 0, 1, 3, 2, 6, 0, 6, 1, 6, 2, 6, 0, 0, 1, 6, 2, 3, 0, 0, 3, 6, 6, 2, 0, 0, 1, 3, 6, 2, 0, 0, 1, 6, 3, 2, 0, 0, 6, 6, 6, 2, 0, 0, 2, 1, 5, 5, 0, 2, 2, 1, 5, 5, 0, 0, 2, 1, 3, 5, 0, 0, 2, 1, 5, 3, 0, 0, 0, 2, 5, 5, 0, 0, 0, 2, 5, 5, 0, 0, 0, 1, 3, 5, 0, 0, 0, 2, 5, 3, 0, 3, 0, 1, 2, 5, 0, 5, 0, 1, 2, 5, 0, 5, 0, 1, 2, 5, 0, 5, 0, 1, 2, 3, 0, 0, 3, 1, 5, 2, 0, 0, 5, 1, 5, 2, 0, 0, 5, 1, 3, 2, 0, 0, 5, 1, 5, 2, 0, 3, 2, 5, 1, 5, 0, 0, 2, 3, 1, 5, 0, 5, 2, 5, 1, 5, 0, 0, 2, 5, 1, 3, 0, 3, 0, 2, 1, 5, 0, 5, 0, 2, 1, 5, 0, 5, 0, 2, 1, 5, 0, 5, 0, 2, 1, 3, 0, 5, 0, 5, 1, 5, 0, 5, 0, 3, 2, 5, 0, 5, 0, 5, 1, 5, 0, 5, 5, 5, 1, 3, 0, 3, 3, 5, 1, 2, 0, 5, 5, 3, 1, 2, 0, 5, 5, 5, 1, 2, 0, 5, 5, 5, 1, 2, 0, 0, 3, 5, 5, 1, 0, 0, 2, 3, 5, 1, 0, 0, 2, 5, 3, 1, 0, 0, 5, 5, 5, 1, 0, 0, 3, 2, 5, 1, 0, 0, 5, 2, 5, 1, 0, 0, 5, 2, 3, 1, 0, 0, 5, 

#### Value Iteration:

In [27]:
# solve the same MDP with value iteration (same discount and iteration cap as policy iteration)
vi = mdptoolbox.mdp.ValueIteration(T, R, 0.9999999, max_iter = 15360)
vi.run()
policy_vi = vi.policy

In [28]:
# one action index per state, in the row order of `states`
print(policy_vi)

(0, 0, 1, 4, 4, 4, 0, 0, 1, 3, 7, 7, 0, 0, 1, 7, 3, 7, 0, 0, 1, 7, 7, 3, 0, 0, 1, 2, 6, 6, 0, 1, 1, 2, 6, 6, 0, 0, 1, 2, 3, 6, 0, 0, 1, 2, 6, 3, 0, 3, 1, 6, 2, 6, 0, 0, 1, 3, 2, 6, 0, 6, 1, 6, 2, 6, 0, 0, 1, 6, 2, 3, 0, 0, 3, 6, 6, 2, 0, 0, 1, 3, 6, 2, 0, 0, 1, 6, 3, 2, 0, 0, 6, 6, 6, 2, 0, 0, 2, 1, 5, 5, 0, 2, 2, 1, 5, 5, 0, 0, 2, 1, 3, 5, 0, 0, 2, 1, 5, 3, 0, 0, 0, 1, 5, 5, 0, 0, 0, 1, 5, 5, 0, 0, 0, 1, 3, 5, 0, 0, 0, 1, 5, 3, 0, 3, 0, 1, 2, 5, 0, 5, 0, 1, 2, 5, 0, 5, 0, 1, 2, 5, 0, 5, 0, 1, 2, 3, 0, 0, 3, 1, 5, 2, 0, 0, 5, 1, 5, 2, 0, 0, 5, 1, 3, 2, 0, 0, 5, 1, 5, 2, 0, 3, 2, 5, 1, 5, 0, 0, 2, 3, 1, 5, 0, 5, 2, 5, 1, 5, 0, 0, 2, 5, 1, 3, 0, 3, 0, 2, 1, 5, 0, 5, 0, 2, 1, 5, 0, 5, 0, 2, 1, 5, 0, 5, 0, 2, 1, 3, 0, 5, 0, 5, 1, 5, 0, 5, 0, 3, 1, 5, 0, 5, 0, 5, 1, 5, 0, 5, 5, 5, 1, 3, 0, 3, 3, 5, 1, 2, 0, 5, 5, 3, 1, 2, 0, 5, 5, 5, 1, 2, 0, 5, 5, 5, 1, 2, 0, 0, 3, 5, 5, 1, 0, 0, 2, 3, 5, 1, 0, 0, 2, 5, 3, 1, 0, 0, 5, 5, 5, 1, 0, 0, 3, 2, 5, 1, 0, 0, 5, 2, 5, 1, 0, 0, 5, 2, 3, 1, 0, 0, 5, 

### Evaluation 2x2

In [39]:
# append the value-iteration action as an extra last column:
# each row becomes [slot_1, ..., slot_4, request, item, action]
states_w_Action = np.concatenate((states, np.expand_dims(policy_vi, axis=1)), axis=1)

In [40]:
# count the test steps for which the policy picks exactly the greedy reference action
n=0
# each row of test is [state vector, greedy slot index]
test,_ = states_actions_2_2.get_test_data()

for i in range(len(test)):
    for j in range(len(states)):
        
        # match on the full state and on the chosen action
        if ((test[i][0] == states_w_Action[j][:-1]).all() and (test[i][1] == states_w_Action[j][-1]).all()):
            n = n + 1

# share of test steps where the policy and the reference agree
Accuracy = n/len(test)
print('Accuracy:', Accuracy)

Accuracy: 0.9


In [41]:
def search(array, in_array):
    """Look up the value stored in the last column for the row that starts with ``array``.

    Args:
        array: the leading columns to look for.
        in_array: table whose rows are ``[*leading columns, value]``.

    Returns:
        ``[value]`` of the last matching row, or the integer ``0`` when no row matches.
    """
    hits = [row[-1] for row in in_array if (array == row[:-1]).all()]
    return [hits[-1]] if hits else 0


In [42]:
import copy

# roll the value-iteration policy out over the test request sequence
# layout of every row: [slot_1, ..., slot_n, request, item, action]
init = copy.copy(states_w_Action[0])
test_states = []
test_states.append(states_w_Action[0])

for i in range(len(test) - 1):

    # action indices below num_pos address a slot, everything else is "do nothing"
    # (num_pos replaces the hard-coded 4 so the roll-out follows the warehouse size)
    # NOTE(review): the slot is written without checking that the action is valid
    # (empty slot for store, matching item for restore), unlike the model in P().
    if init[-1] < num_pos:
        init[init[-1]] = init[-2] if init[-3] == 5 else 0

    # next state: resulting occupancy plus the next request of the test sequence
    new_state = np.concatenate((init[:num_pos], test[i + 1][0][-2:]))
    new_action = search(new_state, states_w_Action)
    new_state_new_action = np.concatenate((new_state, new_action))
    test_states.append(new_state_new_action)
    init = copy.copy(new_state_new_action)

test_states = np.array(test_states)


In [57]:
# total cost of the greedy reference (referenz) and of the policy roll-out (actual):
# every slot action costs the coordinate sum of its slot, "do nothing" actions cost nothing
referenz = 0
actual = 0
for i in range(len(test)):
    # indices >= num_pos are not slot actions (num_pos replaces the hard-coded 3)
    if test[i][1] < num_pos:
        referenz = referenz + np.sum(Action_dict[int(test[i][1])])

for i in range(len(test_states)):
    if test_states[i][-1] < num_pos:
        actual = actual + np.sum(Action_dict[test_states[i][-1]])

In [58]:
# summed slot coordinates: policy roll-out first, greedy reference second
print(actual)
print(referenz)

43
50


In [59]:
# relative difference of the policy's summed cost to the reference, in percent
print(((referenz - actual) / referenz) * 100, '%')

14.000000000000002 %


In [60]:
# greedy reference: [state vector, chosen slot index] per test step
print(test)

[[array([0, 0, 0, 0, 5, 1]) array(0)]
 [array([1, 0, 0, 0, 5, 1]) array(1)]
 [array([1, 1, 0, 0, 8, 1]) array(0)]
 [array([0, 1, 0, 0, 5, 2]) array(0)]
 [array([2, 1, 0, 0, 5, 1]) array(2)]
 [array([2, 1, 1, 0, 5, 1]) array(3)]
 [array([2, 1, 1, 1, 8, 1]) array(1)]
 [array([2, 0, 1, 1, 8, 2]) array(0)]
 [array([0, 0, 1, 1, 5, 2]) array(0)]
 [array([2, 0, 1, 1, 5, 1]) array(1)]
 [array([2, 1, 1, 1, 8, 1]) array(1)]
 [array([2, 0, 1, 1, 5, 1]) array(1)]
 [array([2, 1, 1, 1, 8, 1]) array(1)]
 [array([2, 0, 1, 1, 5, 1]) array(1)]
 [array([2, 1, 1, 1, 8, 1]) array(1)]
 [array([2, 0, 1, 1, 5, 1]) array(1)]
 [array([2, 1, 1, 1, 8, 2]) array(0)]
 [array([0, 1, 1, 1, 5, 1]) array(0)]
 [array([1, 1, 1, 1, 8, 1]) array(0)]
 [array([0, 1, 1, 1, 8, 3]) array(4)]]


In [61]:
# policy roll-out: [slot_1, ..., slot_4, request, item, action] per test step
print(test_states)

[[0 0 0 0 5 1 0]
 [1 0 0 0 5 1 1]
 [1 1 0 0 8 1 0]
 [0 1 0 0 5 2 0]
 [2 1 0 0 5 1 2]
 [2 1 1 0 5 1 3]
 [2 1 1 1 8 1 1]
 [2 0 1 1 8 2 0]
 [0 0 1 1 5 2 1]
 [0 2 1 1 5 1 0]
 [1 2 1 1 8 1 0]
 [0 2 1 1 5 1 0]
 [1 2 1 1 8 1 0]
 [0 2 1 1 5 1 0]
 [1 2 1 1 8 1 0]
 [0 2 1 1 5 1 0]
 [1 2 1 1 8 2 1]
 [1 0 1 1 5 1 4]
 [1 0 1 1 8 1 0]
 [0 0 1 1 8 3 6]]
