### Cab-Driver Agent

In [1]:
# Importing libraries
import numpy as np
import random
import math
from collections import deque
import collections
import pickle

# for building DQN model
from keras import layers
from keras import Sequential
from keras.layers import Dense, Activation, Flatten
from keras.optimizers import Adam

# for plotting graphs
import matplotlib.pyplot as plt

# Import the environment
from Env import CabDriver

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


#### Defining Time Matrix

In [2]:
# Loading the time matrix provided
Time_matrix = np.load("TM.npy")

In [3]:
Time_matrix

array([[[[ 0.,  0.,  0., ...,  0.,  0.,  0.],
         [ 0.,  0.,  0., ...,  0.,  0.,  0.],
         [ 0.,  0.,  0., ...,  0.,  0.,  0.],
         ...,
         [ 0.,  0.,  0., ...,  0.,  0.,  0.],
         [ 0.,  0.,  0., ...,  0.,  0.,  0.],
         [ 0.,  0.,  0., ...,  0.,  0.,  0.]],

        [[ 2.,  3.,  3., ...,  7.,  0.,  6.],
         [ 2.,  3.,  3., ...,  7.,  0.,  6.],
         [ 2.,  3.,  3., ...,  7.,  0.,  6.],
         ...,
         [ 2.,  3.,  6., ...,  7.,  4.,  2.],
         [ 2.,  3.,  6., ...,  7.,  4.,  2.],
         [ 2.,  3.,  6., ...,  7.,  4.,  2.]],

        [[ 2.,  6.,  5., ...,  3.,  7.,  7.],
         [ 2.,  6.,  5., ...,  3.,  7.,  7.],
         [ 2.,  6.,  5., ...,  3.,  7.,  7.],
         ...,
         [ 6.,  2.,  8., ...,  4.,  5.,  5.],
         [ 6.,  2.,  8., ...,  4.,  5.,  5.],
         [ 6.,  2.,  8., ...,  4.,  5.,  5.]],

        [[10.,  6.,  8., ...,  7.,  4.,  6.],
         [10.,  6.,  8., ...,  7.,  4.,  6.],
         [10.,  6.,  8., ...,  7

#### Tracking the state-action pairs for checking convergence


In [4]:
# Initialise Q_dictionary as 'Q_dict' and States_tracked as 'States_track' (for convergence)
# If a Q-dictionary does not exist, run the following cell to initialise an empty dictionary
Q_dict = collections.defaultdict(dict)

States_track = collections.defaultdict(dict)

print(len(Q_dict))
print(len(States_track))

0
0


In [5]:
# Initialise states to be tracked
def initialise_tracking_states():
    sample_q_values = [('8-x-x-x-x-x-x-x-3', (1, 7)),('x-5-x-x-x-8-x-x-x', (0, 1)),
                       ('6-x-7-x-x-x-x-x-x', (1, 1)),('x-x-x-x-6-x-7-x-x', (0, 1))]    #select any 4 Q-values
    for q_values in sample_q_values:
        state = q_values[0]
        action = q_values[1]
        States_track[state][action] = []    #this is an array which will have appended values of that state-action pair for every 2000th episode           

In [6]:
def save_tracking_states():
    for state in States_track.keys():
        for action in States_track[state].keys():
            if state in Q_dict and action in Q_dict[state]:
                States_track[state][action].append(Q_dict[state][action])

In [7]:
# Defining a function which will add new Q-values to the Q-dictionary. 
def add_to_dict(state):
    state1 = Q_state(state)
    
    valid_act = valid_actions(state)
    if state1 not in Q_dict.keys():
        for action in valid_act:
            Q_dict[state1][action]=0

In [8]:
#Defining a function to save the Q-dictionary as a pickle file
def save_obj(obj, name ):
    with open(name + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

### Agent Class

If you are using this framework, you need to fill the following to complete the following code block:
1. State and Action Size
2. Hyperparameters
3. Create a neural-network model in function 'build_model()'
4. Define epsilon-greedy strategy in function 'get_action()'
5. Complete the function 'append_sample()'. This function appends the recent experience tuple <state, action, reward, new-state> to the memory
6. Complete the 'train_model()' function with following logic:
   - If the memory size is greater than mini-batch size, you randomly sample experiences from memory as per the mini-batch size and do the following:
      - Initialise your input and output batch for training the model
      - Calculate the target Q value for each sample: reward + gamma*max(Q(s'a,))
      - Get Q(s', a) values from the last trained model
      - Update the input batch as your encoded state-action and output batch as your Q-values
      - Then fit your DQN model using the updated input and output batch.

In [9]:
class DQNAgent:
    def __init__(self, state_size, action_size):
        # Define size of state and action
        self.state_size = state_size
        self.action_size = action_size

        # Write here: Specify you hyper parameters for the DQN
        self.discount_factor = 0.95
        self.learning_rate = 0.01      
        self.epsilon_max = 1
        self.epsilon = 1
        self.epsilon_decay = 0.99
        self.epsilon_min = 0.01
        self.batch_size = 32        
        # create replay memory using deque
        self.memory = deque(maxlen=2000)

        # create main model and target model
        self.model = self.build_model()


    # approximate Q function using Neural Network
    def build_model(self):
        model = Sequential()
        
        # Write your code here: Add layers to your neural nets       
        model.add(Dense(32, input_dim=state_size, activation='relu', kernel_initializer='he_uniform'))
        model.add(Dense(32, activation='relu', kernel_initializer='he_uniform'))
        model.add(Dense(action_size, activation='linear', kernel_initializer='he_uniform'))
        
        model.compile(loss='mse',optimizer=Adam(lr=self.learning_rate))
        model.summary
        return model



    def get_action(self, state, time):
        # Write your code here:
        # get action from model using epsilon-greedy policy
        # Decay in Îµ after we generate each sample from the environment
        epsilon = - 1/ (1 + np.exp((-time+7500000)/1700000)) + 1
        z = np.random.random()
        
        if z <= epsilon:
            possible_actions_index, actions = env.requests(state)
            return actions[(np.random.choice(len(actions)))] #explore
        else:
            #state = state.reshape(1, self.state_size)
            q_value = self.model.predict(state)
            print(q_value)
            return np.argmax(q_value[0])
        
    

    def append_sample(self, state, action, reward, next_state, done):
        # Write your code here:
        # save sample <s,a,r,s'> to the replay memory
        self.memory.append((state, action, reward, next_state, done))        
    

    # pick samples randomly from replay memory (with batch_size) and train the network
    def train_model(self):
        """
        train the neural network on a minibatch. Input to the network is the states,
        output is the target q-value corresponding to each action.
        """

        if len(self.memory) > self.batch_size:
            
            # sample minibatch from memory
            minibatch = random.sample(self.memory, self.batch_size)

            # initialise two matrices - update_input and update_output
            update_input = np.zeros((self.batch_size, self.state_size))
            update_output = np.zeros((self.batch_size, self.state_size))
            actions, rewards, done = [], [], []

            # populate update_input and update_output and the lists rewards, actions, done
            for i in range(self.batch_size):
                state, action, reward, next_state, done_boolean = minibatch[i]
                update_input[i] = env.state_encod_arch1(state)
                actions.append(action)
                rewards.append(reward)
                update_output[i] = env.state_encod_arch1(next_state)
                done.append(done_boolean)

            # predict the target q-values from states s
            target = self.model.predict(update_input)

            # target for q-network
            target_qval = self.model.predict(update_output)

            # update the target values
            for i in range(self.batch_size):
                if done[i]:
                    target[i][actions[i]] = rewards[i]
                else: # non-terminal state
                    target[i][actions[i]] = rewards[i] + self.discount_factor * np.max(target_qval[i])

            # model fit
            self.model.fit(update_input, target, batch_size=self.batch_size, epochs=1, verbose=0)

    def save(self, name):
        self.model.save_weights(name)

In [10]:
#Car's battery can hold the charge for 30 days 
#and pick up decision can be made only once per hour
#each episode is 30 days long; i.e., terminal state reached at the end of 24*30 hrs
Episodes = 3 #for testing

threshold = 1000
policy_threshold = 1000    #every these many episodes, the Q-dict will be updated

In [11]:
#initialise_tracking_states()

### DQN block

In [12]:
for episode in range(Episodes):

    # Write code here
    print(episode)
    done = False
    score = 0
    reward = 0
    time_steps = 24*30 #duration of each episode
    # Call the environment
    env = CabDriver()
    # Call all the initialised variables of the environment
    action_space = env.action_space
    print(action_space)
    state_space = env.state_space
    
    action_size = len(env.action_space)
    state_size = len(env.state_space)
    
    curr_state = env.state_init #random state at episode start
    print("current state is:")
    print(curr_state)
    #Call the DQN agent
    agent = DQNAgent(action_size=action_size, state_size=state_size)
    
    while not done:
        
        # Write your code here
        print(time_steps)
        # 1. Pick epsilon-greedy action from possible actions for the current state
        action = agent.get_action(curr_state, episode)
        print("action is:")
        print(action)
        
        #select a request and drop the customer to the requested locn
        next_state = env.next_state_func(curr_state, action, Time_matrix)        
        print("next state is:")
        print(next_state)
        
        # 2. Evaluate your reward and next state
        reward = env.reward_func(curr_state, action, Time_matrix)
        
        # 3. Append the experience to the memory
        agent.append_sample(curr_state, action, reward, next_state, done)
        
        # 4. Train the model by calling function agent.train_model
        agent.train_model()
        
        time_steps = time_steps - 1
        if time_steps == 0:
            done = true
            
        score += reward
        curr_state = next_state
        
    # 5. Keep a track of rewards, Q-values, loss
    rewards_per_episode.append(score)
    episodes.append(episode)        
        
    # every episode:
    print("episode {0}, reward {1}, memory_length {2}, epsilon {3}".format(episode,
                                                                         score,
                                                                         len(agent.memory),
                                                                         agent.epsilon))
    # every few episodes:
    if episode % 10 == 0:
        # store q-values of some prespecified state-action pairs
        # q_dict = agent.store_q_values()

        # save model weights
        agent.save_model_weights(name="model_weights.h5")
        
    #TRACKING Q-VALUES
    #if (episode == threshold-1):        #at the 999th episode
    #    initialise_tracking_states()
      
    #if ((episode+1) % threshold) == 0:   #every 1000th episode
    #    save_tracking_states()
    #    save_obj(States_track,'States_tracked')   
    
    #SAVING POLICY
    #if ((episode+1)% policy_threshold ) == 0:  #every 1000th episodes, the Q-dict will be saved
    #    save_obj(Q_dict,'Policy')          
    
#save_obj(States_track,'States_tracked')   
#save_obj(Q_dict,'Policy')    

0
[(0, 1), (0, 2), (0, 3), (0, 4), (1, 0), (1, 2), (1, 3), (1, 4), (2, 0), (2, 1), (2, 3), (2, 4), (3, 0), (3, 1), (3, 2), (3, 4), (4, 0), (4, 1), (4, 2), (4, 3), (0, 0)]
current state is:
(3, 14, 0)
720
action is:
(3, 4)
3
3
14
0
next state is:
(4, 23, 0)
719
action is:
(2, 1)
4
2
23
0
next state is:
(1, 4, 1)
718
action is:
(1, 3)
1
1
4
1
next state is:
(3, 10, 1)
717
action is:
(2, 4)
3
2
10
1
next state is:
(4, 20, 1)
716
action is:
(1, 4)
4
1
20
1
next state is:
(4, 20, 1)
715
action is:
(2, 1)
4
2
20
1
next state is:
(1, 9, 2)
714
action is:
(4, 2)
1
4
9
2
next state is:
(2, 20, 2)
713
action is:
(4, 0)
2
4
20
2
next state is:
(0, 5, 3)
712
action is:
(0, 4)
0
0
5
3
next state is:
(4, 5, 3)
711
action is:
(0, 3)
4
0
5
3
next state is:
(3, 10, 3)
710
action is:
(1, 4)
3
1
10
3
next state is:
(4, 19, 3)
709
action is:
(4, 3)
4
4
19
3
next state is:
(3, 1, 4)
708
action is:
(4, 2)
3
4
1
4
next state is:
(2, 5, 4)
707
action is:
(4, 0)
2
4
5
4
next state is:
(0, 7, 4)
706
action is:


AttributeError: 'tuple' object has no attribute 'ndim'

### Tracking Convergence

#### Epsilon-decay sample function

<div class="alert alert-block alert-info">
Try building a similar epsilon-decay function for your model.
</div>

In [None]:
time = np.arange(0,10000)
epsilon = []
for i in range(0,10000):
    epsilon.append(0 + (1 - 0) * np.exp(-0.0009*i))

In [None]:
plt.plot(time, epsilon)
plt.show()