In [2]:
import pandas as pd 
import numpy as np 
from random import choice 
from scipy.spatial import distance

df = pd.read_csv('bank_data.csv')
#df = df.head(1000)

#print(df)
#make ratio calculation on each item 
df['ratio'] = df['money'] / df['time (hr)']
ratio = df['ratio'].to_numpy()


#output data to array 
data = df.to_numpy()

# calculate time to travel between each location

coords = df.filter(['x_coordinate', 'y_coordinate']).to_numpy()
dist = distance.squareform(distance.pdist(coords))
time_to_travel = (dist / 30)


# create reward matrix for each state/station 

R = np.matrix(time_to_travel / ratio[:, None])

# Q table
Q = np.matrix(np.zeros([10_000, 10_000]))

# discount factor
gamma = 0.8

In [3]:
# Initial state. (Usually to be chosen at random)
initial_state = 1 #np.random.randint(10000) #try with 0

# This function returns all available actions in the state given as an argument
def available_actions(state):
    current_state_row = R[state, :]
    av_act = np.where(current_state_row > 0)[1]
    return av_act

#get the available actions in the current state 
available_act = available_actions(initial_state)

# This function choose at random which action to be performed 
# within the range of all the available action 
def sample_next_action(available_actions_range):
    next_action = int(np.random.choice(available_act, 1))
    return next_action

#sample next action to be performed 
action = sample_next_action(available_act)

# This function updates the Q matrix according to the path selected and the
# Q learning algo
def update(current_state, action, gamma):
    max_index = np.where(Q[action,] == np.max(Q[action, ]))[1]

    if max_index.shape[0] > 1:
        max_index = int(np.random.choice(max_index, size=1))
    else:
        max_index = int(max_index)
    max_value = Q[action, max_index]
    #print(max_value)

    # Q learning formula
    Q[current_state, action] = R[current_state, action] + gamma * max_value

#update Q matrix 
update(initial_state, action, gamma)

#Training 
# Training over 20_000 iterations. (Re-iterate the process above)
for i in range(60_000):
    current_state = np.random.randint(0, int(Q.shape[0]))
    available_act = available_actions(current_state)
    action = sample_next_action(available_act)
    update(current_state, action, gamma)


# Normalize the trained Q matrix
print ("Trained Q matrix:")
print (Q / np.max(Q) * 100)


Trained Q matrix:
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [4]:
high_score = 0
high_steps = []
for i in range(10000):
    current_state = i
    steps = [current_state]
    game_over = False
    score = 0
    time_left = 24
    time_to_exit = distance.euclidean(data[steps[-1]][1:2], [0,0]) / 30

    while game_over is False:

        next_step_index = np.where(Q[current_state,] == np.max(Q[current_state,]))[1]
        
        if next_step_index.shape[0] > 1:
            next_step_index = int(np.random.choice(next_step_index, size = 1))
        else:
            next_step_index = int(next_step_index)

        distance_to_next = distance.euclidean(data[steps][-1][1:2], data[next_step_index][1:2])
        score = score + data[next_step_index][3]
        time_left = time_left - data[next_step_index][4] - (distance_to_next / 30)
        time_to_exit = distance.euclidean(data[steps[-1]][1:2], [0,0]) / 30

        steps.append(next_step_index)
        current_state = next_step_index
        if(time_to_exit > time_left):
            game_over = True
    if (score > high_score):
        high_score = score
        high_steps = steps

# Print selected sequence of steps
print("Selected path:")
print(steps)
print("${:,.2f}".format(score))



Selected path:
[9999, 7560, 8689, 1564, 6309, 3261, 7628, 4643, 1999, 5351, 4024, 2144, 5103, 7730, 2024, 7995, 6345, 4506, 1615, 8104, 6865, 4842]
$305,000.00
