### Cab-Driver Agent

In [1]:
# Importing libraries
import numpy as np
import random
import math
from collections import deque
import collections
import pickle
import time

# for building DQN model
from keras import layers
from keras import Sequential
from keras.layers import Dense, Activation, Flatten
from keras.optimizers import Adam

# for plotting graphs
import matplotlib.pyplot as plt

# Import the environment
from Env import CabDriver

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


#### Defining Time Matrix

In [2]:
# Loading the time matrix provided
Time_matrix = np.load("TM.npy")

In [3]:
print(type(Time_matrix))
print(Time_matrix.max())
print(Time_matrix.min())
print(Time_matrix.mean())
print(Time_matrix.var())

<class 'numpy.ndarray'>
11.0
0.0
3.0542857142857143
7.93705306122449


In [4]:
States_track = collections.defaultdict(dict)
Q_dict = collections.defaultdict(dict)

In [5]:
def Q_state(state):
    return '-'.join(str(e) for e in state)

In [6]:
# Initialise states to be tracked
def initialise_tracking_states():
    sample_q_values = [([1,2,3],(1,3)),([3,5,6],(4,2)),([4,10,2],(3,4)), ([2,7,0],(0,4))]    #select any 4 Q-values
    for q_values in sample_q_values:
        state = Q_state(q_values[0])
        action = q_values[1]
        States_track[state][action] = [] 

In [7]:
def save_tracking_states_old():
    """Saves the states to dictionary"""
    for state in States_track.keys():
        for action in States_track[state].keys():
            if state in Q_dict and action in Q_dict[state]:
                States_track[state][action].append(Q_dict[state][action])

In [8]:
initialise_tracking_states()

In [9]:
print(States_track)

defaultdict(<class 'dict'>, {'1-2-3': {(1, 3): []}, '3-5-6': {(4, 2): []}, '4-10-2': {(3, 4): []}, '2-7-0': {(0, 4): []}})


In [10]:
#Defining a function to save the Q-dictionary as a pickle file
def save_obj(obj, name ):
    with open(name + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

#### Tracking the state-action pairs for checking convergence


In [11]:
#Defining a function to save the Q-dictionary as a pickle file
def save_obj(obj, name ):
    with open(name + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

### Agent Class

If you are using this framework, you need to fill the following to complete the following code block:
1. State and Action Size
2. Hyperparameters
3. Create a neural-network model in function 'build_model()'
4. Define epsilon-greedy strategy in function 'get_action()'
5. Complete the function 'append_sample()'. This function appends the recent experience tuple <state, action, reward, new-state> to the memory
6. Complete the 'train_model()' function with following logic:
   - If the memory size is greater than mini-batch size, you randomly sample experiences from memory as per the mini-batch size and do the following:
      - Initialise your input and output batch for training the model
      - Calculate the target Q value for each sample: reward + gamma*max(Q(s'a,))
      - Get Q(s', a) values from the last trained model
      - Update the input batch as your encoded state and output batch as your Q-values
      - Then fit your DQN model using the updated input and output batch.

In [12]:
class DQNAgent:
    def __init__(self, state_size, action_size):
        # Define size of state and action
        self.state_size = state_size
        self.action_size = action_size

        # Write here: Specify you hyper parameters for the DQN
        self.discount_factor = 0.95
        self.learning_rate = 0.07 #0.05 got nothing with init state fixed, 0.5 same nothing, 0.1, 0.8 nothing
        self.epsilon = 1
        self.epsilon_max = 1
        self.epsilon_decay = -0.003 #for 1k
        #self.epsilon_decay = -0.0003 #for 10k
        #self.epsilon_decay = -0.0001 #for 20k
        #self.epsilon_decay = -0.00003 #for 100k
        #self.epsilon_decay = -0.000003 #for 1M
        self.epsilon_min = 0.01
        
        self.batch_size = 32      # for 24*1
        # create replay memory using deque
        self.memory = deque(maxlen=2000)
        self.states_tracked = []
        self.track_state = np.array(env.state_encod_arch1([0,0,0])).reshape(1, 36)

        # create main model and target model
        self.model = self.build_model()

    # approximate Q function using Neural Network
    def build_model(self):
        input_shape = self.state_size
        model = Sequential()
        # Write your code here: Add layers to your neural nets       
        model.add(Dense(32, input_dim=self.state_size, activation='relu', kernel_initializer='he_uniform'))
        model.add(Dense(32, activation='relu', kernel_initializer='he_uniform'))
        # the output layer: output is of size num_actions
        model.add(Dense(self.action_size, activation='relu', kernel_initializer='he_uniform'))
        model.compile(loss='mse', optimizer=Adam(lr=self.learning_rate))
        model.summary
        return model



    def get_action(self, state):
        action_indices, action_list = env.requests(state)
        #print(self.epsilon)
        if np.random.rand() <= self.epsilon:
            # explore: choose a random action from all possible actions
            # in case of cartpole this will randomly choose an action between 0 and 1
            #print('explore')
            #if (len(action_indices) == 0):
                #print(state)
            action_index = random.randrange(len(action_indices))
        else:
            # choose the action with the highest q(s, a)
            # the first index corresponds to the batch size, so
            # reshape state to (1, state_size) so that the first index corresponds to the batch size
            #state = state.reshape(1, self.state_size)
            #print(env.state_encod_arch1(state).shape)
            #state_input = np.zeros((1, self.state_size))
            #state_input[0] = env.state_encod_arch1(state)
            #print('Greedy')
            #print(state_input[0].shape)
            state = np.array(env.state_encod_arch1(state)).reshape(1, 36)
            #print(state.shape)
            #q_value = self.model.predict(state_input[0])
            q_value = self.model.predict(state)
            #print(q_value.shape)
            #print(q_value)
            q_value =[q_value[0][i] for i in action_indices]
            action_index = np.argmax(q_value[0])
            
        #print(action_indices, action_index)
        #print(len(action_indices))
        #print(self.action_size)
        return action_index, action_list[action_index]
    # Write your code here:
    # get action from model using epsilon-greedy policy
    # Decay in ε after we generate each sample from the environment

    def append_sample(self, state, action_index, reward, next_state, done):
        self.memory.append((state, action_index, reward, next_state, done))
    # Write your code here:
    # save sample <s,a,r,s'> to the replay memory
    
    
    
    # pick samples randomly from replay memory (with batch_size) and train the network
    def train_model(self):
        
        if len(self.memory) > self.batch_size:
            # Sample batch from the memory
            mini_batch = random.sample(self.memory, self.batch_size)
            # initialise two matrices - update_input and update_output
            update_input = np.zeros((self.batch_size, self.state_size))
            update_output = np.zeros((self.batch_size, self.state_size))
            actions, rewards, done = [], [], []

            # populate update_input and update_output and the lists rewards, actions, done
            for i in range(self.batch_size):
                state, action, reward, next_state, done_boolean = mini_batch[i]
                #print(state)
                #if (i == 0):
                    #print(type(update_input[i]))
                    #print(update_input[i].shape)
                    #print(type(state))
                update_input[i] = env.state_encod_arch1(state)
                actions.append(action)
                rewards.append(reward)
                update_output[i] = env.state_encod_arch1(next_state)
                done.append(done_boolean)

            # predict the target q-values from states s
            target = self.model.predict(update_input)
            #print(target[0])
            # target for q-network
            target_qval = self.model.predict(update_output)

            # update the target values
            for i in range(self.batch_size):
                if done[i]:
                    target[i][actions[i]] = rewards[i]
                else: # non-terminal state
                    target[i][actions[i]] = rewards[i] + self.discount_factor * np.max(target_qval[i])

            # model fit
            self.model.fit(update_input, target, batch_size=self.batch_size, epochs=1, verbose=0)
            
    def save_tracking_states(self):
            #print(state.shape)
            #q_value = self.model.predict(state_input[0])
        q_value = self.model.predict(self.track_state)
            #print(q_value.shape)
            #print(q_value)
        #q_value =[q_value[0][i] for i in action_indices]
        self.states_tracked.append(q_value[0][2])

    def save(self, name):
        self.model.save(name)

In [13]:
Episodes = 1

### DQN block

for episode in range(Episodes):

    # Write code here
    # Call the environment
    # Call all the initialised variables of the environment
    

    #Call the DQN agent
    
    
    while !terminal_state:
        
        # Write your code here
        # 1. Pick epsilon-greedy action from possible actions for the current state
        # 2. Evaluate your reward and next state
        # 3. Append the experience to the memory
        # 4. Train the model by calling function agent.train_model
        # 5. Keep a track of rewards, Q-values, loss
        

In [14]:
state_size = 36
action_size = 21
episode_time = 24*30
n_episodes = 1
n_episodes = 2000
m = 5
t = 24
d = 7
env = CabDriver()
agent = DQNAgent(action_size=action_size, state_size=state_size)


# to store rewards in each episode
rewards_per_episode, episodes = [], []

In [15]:
start_time = time.time()
score_tracked = []
num_hits = 0
for episode in range(n_episodes):

    done = False
    score = 0

    # reset at the start of each episode
    env = CabDriver()
    action_space, state_space, state = env.reset()
    state_size = m+t+d
    action_size = len(action_space)
    #agent = DQNAgent(state_size, action_size)
    #print(state)
    #print(state_space)
    total_time = 0
    step_num = 0
    while not done:
        step_num = step_num + 1
        #print(step_num)
        # get action for the current state and take a step in the environment
        action_index, action = agent.get_action(state)
        if ((state_space.index(state) == 0) and (action_index == 2)):
            num_hits = num_hits + 1
        print(state, action)
        reward, next_state, step_time = env.step(state, action, Time_matrix)
        print(state, next_state, reward)
        total_time += step_time
        if (total_time > episode_time):
            done = True
            reward = 0
        # save the sample <s, a, r, s', done> to the replay memory
        agent.append_sample(state, action_index, reward, next_state, done)

        # train after each step
        agent.train_model()

        # add reward to the total score of this episode
        score += reward
        state = next_state


    # store total reward obtained in this episode
    rewards_per_episode.append(score)
    episodes.append(episode)

    # epsilon decay
    #agent.epsilon = (1 - 0.0001) * np.exp(agent.epsilon_decay * episode)
    agent.epsilon = agent.epsilon * 0.999 # for 2k
    #(1 - 0.0001) * np.exp(-0.003*i)
    #if agent.epsilon > agent.epsilon_min:
    #    agent.epsilon *= agent.epsilon_decay

    # every episode:
    if (episode % 1 == 0):
        print("episode {0}, reward {1}, memory_length {2}, epsilon {3} total_time {4}".format(episode,
                                                                         score,
                                                                         len(agent.memory),
                                                                         agent.epsilon, total_time))
        agent.save_tracking_states()
        score_tracked.append(score) 

elapsed_time = time.time() - start_time
#save_obj(States_track,'States_tracked')   
print(elapsed_time)
    # every few episodes:
        # store q-values of some prespecified state-action pairs
        # q_dict = agent.store_q_values()

        # save model weights
        #agent.save_model_weights(name="model_weights.h5")


[0, 0, 0] (2, 4)
[0, 0, 0] [4, 3, 0] -6.0
[4, 3, 0] (1, 0)
[4, 3, 0] [0, 16, 0] 16.0
[0, 16, 0] (0, 0)
[0, 16, 0] [0, 17, 0] -5
[0, 17, 0] (1, 4)
[0, 17, 0] [4, 1, 1] -4.0
[4, 1, 1] (0, 3)
[4, 1, 1] [3, 8, 1] 19.0
[3, 8, 1] (0, 1)
[3, 8, 1] [1, 18, 1] 13.0
[1, 18, 1] (0, 0)
[1, 18, 1] [1, 19, 1] -5
[1, 19, 1] (0, 2)
[1, 19, 1] [2, 0, 2] -7.0
[2, 0, 2] (0, 0)
[2, 0, 2] [2, 1, 2] -5
[2, 1, 2] (0, 0)
[2, 1, 2] [2, 2, 2] -5
[2, 2, 2] (3, 0)
[2, 2, 2] [0, 13, 2] 17.0
[0, 13, 2] (0, 3)
[0, 13, 2] [3, 15, 2] 8.0
[3, 15, 2] (1, 3)
[3, 15, 2] [3, 17, 2] -1.0
[3, 17, 2] (1, 2)
[3, 17, 2] [2, 1, 3] 23.0
[2, 1, 3] (1, 4)
[2, 1, 3] [4, 14, 3] -11.0
[4, 14, 3] (0, 0)
[4, 14, 3] [4, 15, 3] -5
[4, 15, 3] (0, 1)
[4, 15, 3] [1, 19, 3] 16.0
[1, 19, 3] (4, 1)
[1, 19, 3] [1, 19, 3] 0.0
[1, 19, 3] (2, 4)
[1, 19, 3] [4, 4, 4] -36.0
[4, 4, 4] (3, 4)
[4, 4, 4] [4, 12, 4] 5.0
[4, 12, 4] (3, 0)
[4, 12, 4] [0, 20, 4] -13.0
[0, 20, 4] (2, 3)
[0, 20, 4] [3, 11, 5] 24.0
[3, 11, 5] (0, 0)
[3, 11, 5] [3, 12, 5] -5
[3,

[1, 16, 3] (0, 2)
[1, 16, 3] [2, 1, 4] 0.0
[2, 1, 4] (1, 2)
[2, 1, 4] [2, 15, 4] -7.0
[2, 15, 4] (0, 3)
[2, 15, 4] [3, 19, 4] -11.0
[3, 19, 4] (4, 2)
[3, 19, 4] [2, 23, 4] -20.0
[2, 23, 4] (2, 3)
[2, 23, 4] [3, 5, 5] 24.0
[3, 5, 5] (1, 2)
[3, 5, 5] [2, 20, 5] 24.0
[2, 20, 5] (0, 1)
[2, 20, 5] [1, 7, 6] -1.0
[1, 7, 6] (1, 4)
[1, 7, 6] [4, 14, 6] 28.0
[4, 14, 6] (4, 3)
[4, 14, 6] [3, 17, 6] 12.0
[3, 17, 6] (1, 3)
[3, 17, 6] [3, 17, 6] 0.0
[3, 17, 6] (1, 3)
[3, 17, 6] [3, 17, 6] 0.0
[3, 17, 6] (4, 0)
[3, 17, 6] [0, 4, 0] 17.0
[0, 4, 0] (3, 1)
[0, 4, 0] [1, 15, 0] -46.0
[1, 15, 0] (2, 3)
[1, 15, 0] [3, 1, 1] 31.0
[3, 1, 1] (2, 4)
[3, 1, 1] [4, 12, 1] -10.0
[4, 12, 1] (0, 4)
[4, 12, 1] [4, 14, 1] -1.0
[4, 14, 1] (0, 1)
[4, 14, 1] [1, 22, 1] 23.0
[1, 22, 1] (0, 4)
[1, 22, 1] [4, 2, 2] -11.0
[4, 2, 2] (1, 3)
[4, 2, 2] [3, 8, 2] -12.0
[3, 8, 2] (3, 4)
[3, 8, 2] [4, 14, 2] 24.0
[4, 14, 2] (4, 3)
[4, 14, 2] [3, 20, 2] 24.0
[3, 20, 2] (1, 4)
[3, 20, 2] [4, 0, 3] 7.0
[4, 0, 3] (2, 0)
[4, 0, 3] [0,

[2, 0, 1] (1, 0)
[2, 0, 1] [0, 13, 1] -2.0
[0, 13, 1] (2, 1)
[0, 13, 1] [1, 19, 1] -3.0
[1, 19, 1] (3, 2)
[1, 19, 1] [2, 1, 2] 6.0
[2, 1, 2] (3, 2)
[2, 1, 2] [2, 7, 2] -3.0
[2, 7, 2] (4, 0)
[2, 7, 2] [0, 14, 2] -8.0
[0, 14, 2] (0, 3)
[0, 14, 2] [3, 16, 2] 8.0
[3, 16, 2] (0, 3)
[3, 16, 2] [3, 20, 2] -2.0
[3, 20, 2] (1, 4)
[3, 20, 2] [4, 0, 3] 7.0
[4, 0, 3] (4, 3)
[4, 0, 3] [3, 2, 3] 8.0
[3, 2, 3] (1, 3)
[3, 2, 3] [3, 8, 3] -3.0
[3, 8, 3] (0, 0)
[3, 8, 3] [3, 9, 3] -5
[3, 9, 3] (2, 4)
[3, 9, 3] [4, 15, 3] -21.0
[4, 15, 3] (1, 4)
[4, 15, 3] [4, 15, 3] 0.0
[4, 15, 3] (3, 1)
[4, 15, 3] [1, 22, 3] -26.0
[1, 22, 3] (2, 0)
[1, 22, 3] [0, 11, 4] -20.0
[0, 11, 4] (4, 1)
[0, 11, 4] [1, 17, 4] 24.0
[1, 17, 4] (0, 3)
[1, 17, 4] [3, 21, 4] -11.0
[3, 21, 4] (2, 1)
[3, 21, 4] [1, 12, 5] 6.0
[1, 12, 5] (2, 0)
[1, 12, 5] [0, 18, 5] 6.0
[0, 18, 5] (0, 0)
[0, 18, 5] [0, 19, 5] -5
[0, 19, 5] (0, 0)
[0, 19, 5] [0, 20, 5] -5
[0, 20, 5] (4, 0)
[0, 20, 5] [0, 22, 5] -1.0
[0, 22, 5] (0, 0)
[0, 22, 5] [0, 23, 5]

[4, 3, 5] (2, 1)
[4, 3, 5] [1, 12, 5] 36.0
[1, 12, 5] (4, 2)
[1, 12, 5] [2, 12, 5] 0.0
[2, 12, 5] (3, 2)
[2, 12, 5] [2, 1, 6] -20.0
[2, 1, 6] (0, 4)
[2, 1, 6] [4, 11, 6] -23.0
[4, 11, 6] (4, 1)
[4, 11, 6] [1, 18, 6] 28.0
[1, 18, 6] (0, 3)
[1, 18, 6] [3, 23, 6] 2.0
[3, 23, 6] (4, 0)
[3, 23, 6] [0, 3, 0] -11.0
[0, 3, 0] (0, 0)
[0, 3, 0] [0, 4, 0] -5
[0, 4, 0] (4, 1)
[0, 4, 0] [1, 9, 0] 11.0
[1, 9, 0] (3, 4)
[1, 9, 0] [4, 17, 0] 23.0
[4, 17, 0] (4, 1)
[4, 17, 0] [1, 17, 0] 0.0
[1, 17, 0] (2, 1)
[1, 17, 0] [1, 23, 0] 15.0
[1, 23, 0] (0, 3)
[1, 23, 0] [3, 7, 1] 14.0
[3, 7, 1] (4, 3)
[3, 7, 1] [3, 20, 1] -11.0
[3, 20, 1] (3, 0)
[3, 20, 1] [0, 21, 1] 4.0
[0, 21, 1] (3, 0)
[0, 21, 1] [0, 23, 1] -1.0
[0, 23, 1] (4, 1)
[0, 23, 1] [1, 7, 2] -4.0
[1, 7, 2] (0, 0)
[1, 7, 2] [1, 8, 2] -5
[1, 8, 2] (1, 0)
[1, 8, 2] [0, 17, 2] 36.0
[0, 17, 2] (0, 3)
[0, 17, 2] [3, 19, 2] 8.0
[3, 19, 2] (4, 1)
[3, 19, 2] [1, 2, 3] -8.0
[1, 2, 3] (3, 1)
[1, 2, 3] [1, 8, 3] -3.0
[1, 8, 3] (0, 0)
[1, 8, 3] [1, 9, 3] -5
[1

[0, 15, 2] (2, 4)
[0, 15, 2] [4, 18, 2] 3.0
[4, 18, 2] (0, 2)
[4, 18, 2] [2, 4, 3] -23.0
[2, 4, 3] (2, 1)
[2, 4, 3] [1, 11, 3] 28.0
[1, 11, 3] (4, 2)
[1, 11, 3] [2, 18, 3] -26.0
[2, 18, 3] (0, 0)
[2, 18, 3] [2, 19, 3] -5
[2, 19, 3] (2, 0)
[2, 19, 3] [0, 0, 4] 20.0
[0, 0, 4] (0, 0)
[0, 0, 4] [0, 1, 4] -5
[0, 1, 4] (0, 0)
[0, 1, 4] [0, 2, 4] -5
[0, 2, 4] (2, 0)
[0, 2, 4] [0, 8, 4] -3.0
[0, 8, 4] (2, 0)
[0, 8, 4] [0, 16, 4] -13.0
[0, 16, 4] (0, 0)
[0, 16, 4] [0, 17, 4] -5
[0, 17, 4] (0, 0)
[0, 17, 4] [0, 18, 4] -5
[0, 18, 4] (1, 2)
[0, 18, 4] [2, 10, 5] 1.0
[2, 10, 5] (2, 1)
[2, 10, 5] [1, 21, 5] 44.0
[1, 21, 5] (4, 0)
[1, 21, 5] [0, 0, 6] -6.0
[0, 0, 6] (0, 0)
[0, 0, 6] [0, 1, 6] -5
[0, 1, 6] (0, 3)
[0, 1, 6] [3, 7, 6] 24.0
[3, 7, 6] (0, 0)
[3, 7, 6] [3, 8, 6] -5
[3, 8, 6] (2, 3)
[3, 8, 6] [3, 22, 6] 2.0
[3, 22, 6] (3, 0)
[3, 22, 6] [0, 1, 0] 12.0
[0, 1, 0] (3, 1)
[0, 1, 0] [1, 12, 0] -46.0
[1, 12, 0] (4, 1)
[1, 12, 0] [1, 12, 0] 0.0
[1, 12, 0] (2, 0)
[1, 12, 0] [0, 15, 0] 3.0
[0, 15, 0]

[0, 16, 2] (4, 2)
[0, 16, 2] [2, 19, 2] 3.0
[2, 19, 2] (3, 2)
[2, 19, 2] [2, 3, 3] -4.0
[2, 3, 3] (0, 0)
[2, 3, 3] [2, 4, 3] -5
[2, 4, 3] (1, 0)
[2, 4, 3] [0, 18, 3] -7.0
[0, 18, 3] (1, 2)
[0, 18, 3] [2, 4, 4] 22.0
[2, 4, 4] (4, 0)
[2, 4, 4] [0, 6, 4] -1.0
[0, 6, 4] (2, 3)
[0, 6, 4] [3, 14, 4] -13.0
[3, 14, 4] (0, 1)
[3, 14, 4] [1, 20, 4] -3.0
[1, 20, 4] (0, 1)
[1, 20, 4] [1, 3, 5] -35.0
[1, 3, 5] (3, 2)
[1, 3, 5] [2, 9, 5] -12.0
[2, 9, 5] (4, 3)
[2, 9, 5] [3, 23, 5] -16.0
[3, 23, 5] (0, 4)
[3, 23, 5] [4, 3, 6] -11.0
[4, 3, 6] (3, 0)
[4, 3, 6] [0, 11, 6] 14.0
[0, 11, 6] (3, 0)
[0, 11, 6] [0, 14, 6] -15.0
[0, 14, 6] (0, 0)
[0, 14, 6] [0, 15, 6] -5
[0, 15, 6] (0, 0)
[0, 15, 6] [0, 16, 6] -5
[0, 16, 6] (2, 4)
[0, 16, 6] [4, 21, 6] -7.0
[4, 21, 6] (3, 0)
[4, 21, 6] [0, 10, 0] 25.0
[0, 10, 0] (0, 0)
[0, 10, 0] [0, 11, 0] -5
[0, 11, 0] (2, 4)
[0, 11, 0] [4, 19, 0] -22.0
[4, 19, 0] (1, 0)
[4, 19, 0] [0, 1, 1] -12.0
[0, 1, 1] (0, 0)
[0, 1, 1] [0, 2, 1] -5
[0, 2, 1] (0, 0)
[0, 2, 1] [0, 3, 1] -

[2, 15, 2] (4, 3)
[2, 15, 2] [3, 23, 2] 14.0
[3, 23, 2] (3, 4)
[3, 23, 2] [4, 3, 3] 16.0
[4, 3, 3] (4, 3)
[4, 3, 3] [3, 5, 3] 8.0
[3, 5, 3] (4, 1)
[3, 5, 3] [1, 13, 3] 14.0
[1, 13, 3] (0, 0)
[1, 13, 3] [1, 14, 3] -5
[1, 14, 3] (1, 3)
[1, 14, 3] [3, 18, 3] 16.0
[3, 18, 3] (2, 1)
[3, 18, 3] [1, 2, 4] 32.0
[1, 2, 4] (4, 3)
[1, 2, 4] [3, 8, 4] -3.0
[3, 8, 4] (2, 4)
[3, 8, 4] [4, 19, 4] 17.0
[4, 19, 4] (2, 3)
[4, 19, 4] [3, 1, 5] 24.0
[3, 1, 5] (0, 0)
[3, 1, 5] [3, 2, 5] -5
[3, 2, 5] (2, 1)
[3, 2, 5] [1, 15, 5] -47.0
[1, 15, 5] (0, 4)
[1, 15, 5] [4, 20, 5] -16.0
[4, 20, 5] (0, 1)
[4, 20, 5] [1, 1, 6] 11.0
[1, 1, 6] (0, 1)
[1, 1, 6] [1, 13, 6] -6.0
[1, 13, 6] (4, 2)
[1, 13, 6] [2, 15, 6] -10.0
[2, 15, 6] (0, 0)
[2, 15, 6] [2, 16, 6] -5
[2, 16, 6] (0, 4)
[2, 16, 6] [4, 3, 0] 17.0
[4, 3, 0] (2, 0)
[4, 3, 0] [0, 6, 0] 3.0
[0, 6, 0] (0, 0)
[0, 6, 0] [0, 7, 0] -5
[0, 7, 0] (0, 0)
[0, 7, 0] [0, 8, 0] -5
[0, 8, 0] (1, 4)
[0, 8, 0] [4, 17, 0] -45.0
[4, 17, 0] (0, 3)
[4, 17, 0] [3, 20, 0] -15.0
[3, 2

KeyboardInterrupt: 

In [None]:
Time_matrix[0][4][0][0]

In [None]:
Time_matrix[4][2][1][0]

In [None]:
Time_matrix[2][0][2][0]

In [None]:
Time_matrix[0][2][4][0]

In [None]:
Time_matrix[2][1][6][0]

In [None]:
Time_matrix[1][0][16][0]

In [None]:
Time_matrix[0][3][20][0]

In [None]:
Time_matrix[3][2][20][0]

In [None]:
Time_matrix[2][3][23][0]

In [None]:
env.state_get_loc(state)

### Tracking Convergence

In [None]:
agent.states_tracked

In [None]:
plt.figure(0, figsize=(16,7))
#plt.subplot(221)
plt.title('state [2,4,6]  action index 2')
xaxis = np.asarray(range(0, len(agent.states_tracked)))
plt.plot(xaxis,np.asarray(agent.states_tracked))
plt.show()

In [None]:
num_hits

#### Epsilon-decay sample function

<div class="alert alert-block alert-info">
Try building a similar epsilon-decay function for your model.
</div>

In [None]:
time = np.arange(0,10000)
epsilon = []
for i in range(0,10000):
    epsilon.append(0 + (1 - 0) * np.exp(-0.0009*i))

In [None]:
plt.plot(time, epsilon)
plt.show()

In [None]:
time = np.arange(0,2000)
epsilon = []
for i in range(0,2000):
    epsilon.append(0 + (1 - 0.00001) * np.exp(-0.0003*i))
    z = np.random.random()

In [None]:
time = np.arange(0,2000)
epsilon = []
epsilon_c = 1
for i in range(0,2000):
    epsilon.append(epsilon_c)
    epsilon_c = epsilon_c * 0.999

In [None]:
plt.plot(time, epsilon)
plt.show()