### Cab-Driver Agent

In [1]:
# Importing libraries
import numpy as np
import random
import math
from collections import deque
import collections
import pickle

# for building DQN model
from keras import layers
from keras import Sequential
from keras.layers import Dense, Activation, Flatten
from tensorflow.keras.optimizers import Adam

# for plotting graphs
import matplotlib.pyplot as plt

In [2]:
#!pip install tqdm

In [3]:
from itertools import permutations
import time
from tqdm import tqdm

In [4]:
# Import the environment
from Env import CabDriver

In [5]:
import tensorflow as tf
print(tf.__version__)

2.7.0


#### Defining Time Matrix

In [6]:
# Loading the time matrix provided
Time_matrix = np.load("TM.npy")

In [7]:
env = CabDriver()

In [8]:
env.state_init

[3, 23, 3]

In [9]:
e = env.state_encod_arch1((1,13,4))
e

[0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0]

In [10]:
r = env.requests((1,13,4))
r

([10, 18, 3, 15, 4, 12, 16, 13, 5, 20, 17, 6, 11, 7, 8],
 [(2, 0),
  (4, 2),
  (0, 3),
  (3, 0),
  (0, 4),
  (2, 4),
  (3, 4),
  (3, 1),
  (1, 0),
  (4, 0),
  (4, 1),
  (1, 2),
  (2, 3),
  (1, 3),
  (1, 4),
  (0, 0)])

In [11]:
r = env.requests((4,13,4))
r

([2, 10, 16, 9, 6, 13, 8, 3, 19, 17, 5, 4, 14],
 [(0, 2),
  (2, 0),
  (3, 4),
  (2, 1),
  (1, 2),
  (3, 1),
  (1, 4),
  (0, 3),
  (4, 3),
  (4, 1),
  (1, 0),
  (0, 4),
  (3, 2),
  (0, 0)])

In [12]:
re = env.reward_func((1,14,2), (1,3), Time_matrix)
re

4.0

In [13]:
nextSt = env.next_state_func((1,2,3), (3,1), Time_matrix)
nextSt

((1, 8, 3), False, 6)

#### Tracking the state-action pairs for checking convergence


In [14]:
def initialize_tracking_states():
    state_action_pair = [((1, 0, 0), (1, 2)),
                         ((1, 1, 1), (4, 5)),
                         ((2, 2, 2), (1, 3)),
                         ((2, 2, 2), (3, 4)),
                         ((3, 3, 3), (3, 2)),
                         ((3, 3, 3), (4, 5)),
                         ((4, 4, 4), (4, 1)), 
                         ((4, 4, 4), (2, 1)),
                         ((5, 5, 5), (1, 2)),
                         ((5, 5, 5), (2, 3))]

    for st, ac in state_action_pair:
        if st not in tracked_states:
            tracked_states[st] = defaultdict()
        tracked_states[st][ac] = list()

In [15]:
def save_tracking_states():
    for state in States_track.keys():
        for action in States_track[state].keys():
            if state in Q_dict and action in Q_dict[state]:
                States_track[state][action].append(Q_dict[state][action])

In [16]:
#Defining a function to save the Q-dictionary as a pickle file
def save_obj(obj, name ):
    with open(name + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

### Agent Class

If you are using this framework, you need to fill the following to complete the following code block:
1. State and Action Size
2. Hyperparameters
3. Create a neural-network model in function 'build_model()'
4. Define epsilon-greedy strategy in function 'get_action()'
5. Complete the function 'append_sample()'. This function appends the recent experience tuple <state, action, reward, new-state> to the memory
6. Complete the 'train_model()' function with following logic:
   - If the memory size is greater than mini-batch size, you randomly sample experiences from memory as per the mini-batch size and do the following:
      - Initialise your input and output batch for training the model
      - Calculate the target Q value for each sample: reward + gamma*max(Q(s'a,))
      - Get Q(s', a) values from the last trained model
      - Update the input batch as your encoded state and output batch as your Q-values
      - Then fit your DQN model using the updated input and output batch.

In [17]:
len(np.zeros((32, 3)))

32

In [37]:
class DQNAgent:
    
    def __init__(self, state_size, action_size, discount_factor=0.95, learning_rate=0.01, 
                 epsilon=1, epsilon_delay=0.09, epsilon_min=0.01):
        # Define size of state and action
        self.state_size = state_size
        self.action_size = action_size

        # Write here: Specify you hyper parameters for the DQN
        self.discount_factor = discount_factor
        self.learning_rate = learning_rate
        self.epsilon = epsilon
        self.epsilon_decay = epsilon_delay
        self.epsilon_min = epsilon_min
        
        self.batch_size = 32        
        # create replay memory using deque
        self.memory = deque(maxlen=2000)

        # create main model and target model
        self.model = self.build_model()

    # approximate Q function using Neural Network
    def build_model(self):
        model = Sequential()
        
        # Hidden layers
        model.add(tf.keras.Input(shape=(self.state_size,)))
        model.add(Dense(self.batch_size, activation='relu', kernel_initializer='he_uniform'))
        model.add(Dense(self.batch_size, activation='relu', kernel_initializer='he_uniform'))
        
        #Output layer: with size of action_size
        model.add(Dense(self.action_size, activation='relu', kernel_initializer='he_uniform'))
        
        model.compile(loss='mse',optimizer=Adam(lr=self.learning_rate))
        model.summary()
        return model

    def get_action(self, state, possible_action_index, actions, episode):
    # Write your code here:
    # get action from model using epsilon-greedy policy
    # Decay in ε after we generate each sample from the environment
    
        # epsilon decay
        self.epsilon = (max_epsilon - min_epsilon) * np.exp(self.epsilon_decay * episode)
        
        #Towards exploration
        if(np.random.rand() <= self.epsilon):
            return random.choice(actions)
        #Exploit the learned policy till now
        else:
            encoded_state = env.state_encod_arch1(state)
            encoded_state = encoded_state.reshape(1, self.state_size)
            q_value = self.model.predict(encoded_state)
            
            possible_q_value = [q_value[0][act] for act in possible_action_index]
            
            best_action_index = np.argmax(possible_q_value[0])
            best_action = actions[best_action_index]
            
            return best_action     

    def append_sample(self, state, action, reward, next_state, done):
    # Write your code here:
    # save sample <s,a,r,s'> to the replay memory
        self.memory.append((state, action, reward, next_state, done))
    
    
    # pick samples randomly from replay memory (with batch_size) and train the network
    def train_model(self):
        
        if len(self.memory) > self.batch_size:
            
            # sample minibatch from memory
            minibatch = random.sample(self.memory, self.batch_size)
            
            #initialise two matrices - update_input and update_output
            update_input = np.zeros((self.batch_size, self.state_size))
            update_output = np.zeros((self.batch_size, self.state_size))
            actions, rewards, done = [], [], []
            
            #populate update_input, update_output and the lists->rewards, action and done
            for i in range(self.batch_size):
                state, action, reward, next_state, done_boolean = minibatch[i]
                update_input[i] = env.state_encod_arch1(state)
                actions.append(action)
                rewards.append(reward)
                update_output[i] = env.state_encod_arch1(next_state)
                done.append(done_boolean)
                
            # 1. Predict the target from earlier model
            target = self.model.predict(update_input)
            
            # 2. Get the target for the Q-network
            target_qval = self.model.predict(update_output)
            
            #3. Update your 'update_output' and 'update_input' batch
            for i in range(self.batch_size):
                if done[i]:
                    target[i][actions[i]] = rewards[i]
                else:
                    target[i][actions[i]] = rewards[i] + self.discount_factor * np.max(target_qval[i])
                    
            #model fit
            self.model.fit(update_input, target, batch_size=self.batch_size, epochs = 1, verbose=1)


    def save(self, name):
        self.model.save(name)

### DQN block

In [28]:
# Hyperparameters

LR = 0.01
GAMMA = 0.9

batch_size = 32

Episodes = 1000 

max_epsilon = 1
min_epsilon = 0.001
decay_rate = 0.0009        # epsilon decay rate

In [29]:
action_space, state_space, state = env.reset()

#Call the DQN agent
agent = DQNAgent(len(env.state_encod_arch1(state)), len(env.action_space), GAMMA, LR, 
                      max_epsilon, decay_rate, min_epsilon)

In [30]:
agent.action_size

21

In [40]:
mod = agent.build_model()

In [41]:
mod.summary()

Model: "sequential_9"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_27 (Dense)            (None, 32)                1184      
                                                                 
 dense_28 (Dense)            (None, 32)                1056      
                                                                 
 dense_29 (Dense)            (None, 21)                693       
                                                                 
Total params: 2,933
Trainable params: 2,933
Non-trainable params: 0
_________________________________________________________________


In [31]:
# Variable to track and plot later
total_reward_per_episode = []

In [32]:
start_time = time.time() 

rewards_per_episode = 0

for episode in range(Episodes):

    reward_per_episode = 0
    terminal_state = False
    # Write code here
    # Call the environment
    # Call all the initialised variables of the environment
    env = CabDriver()   
    action_space, state_space, state = env.reset()

    #Call the DQN agent
    agent = DQNAgent(len(env.state_encod_arch1(state)), len(env.action_space), GAMMA, LR, 
                      max_epsilon, decay_rate, min_epsilon)
    
    while not terminal_state:
        
        # 1. Possible requests for the driver in his current state
        possible_request_index, requests = env.requests(state)
        
        # 2. Pick epsilon-greedy action from possible actions for the current state
        best_action = agent.get_action(state, possible_request_index, requests, episode)
        
        # 3. Evaluate the reward
        reward = env.reward_func(state, best_action, Time_matrix)
        
        # 4. Evaluate the next state after performing best action
        next_state, terminal_state, total_ridetime = env.next_state_func(state, best_action, Time_matrix)
        
        # 5. Append the experience to the memory
        agent.append_sample(state, best_action, reward, next_state, terminal_state)
        
        # 6. Train the model by calling function agent.train_model
        agent.train_model()
        
        state = next_state
        
        rewards_per_episode += reward
        
    total_reward_per_episode.append(rewards_per_episode)
        
        # 5. Keep a track of rewards, Q-values, loss

    if (episode + 1) % 100 == 0:
        print("episode {0}, reward {1}, memory_length {2}, epsilon {3}".format(episode,
                                                                         rewards_per_episode,
                                                                         len(agent.memory),
                                                                         agent.epsilon))

    if ((episode + 1) % 100) == 0:   
        save_tracking_states()

    if ((episode + 1) % 1000000) == 0:
        print('Processed %dM episodes'%((episode+1)/1000000))
    
elapsed_time = time.time() - start_time
print('Total Execution time: ', elapsed_time)

IndexError: too many indices for array: array is 1-dimensional, but 2 were indexed

### Tracking Convergence

#### Epsilon-decay sample function

<div class="alert alert-block alert-info">
Try building a similar epsilon-decay function for your model.
</div>

In [None]:
time = np.arange(0,10000)
epsilon = []
for i in range(0,10000):
    epsilon.append(0 + (1 - 0) * np.exp(-0.0009*i))

In [None]:
plt.plot(time, epsilon)
plt.show()