### Cab-Driver Agent

In [1]:
# Importing libraries
import numpy as np
import random
import math
from collections import deque
import collections
import pickle
import time

# for building DQN model
from keras import layers
from keras import Sequential
from keras.layers import Flatten, Activation, Dense
from tensorflow.keras.optimizers import Adam

import matplotlib.pyplot as plt
%matplotlib inline

# Import the environment
from Env import CabDriver

After reset (1, 15, 2)


#### Defining Time Matrix

In [2]:
# Load the provided time matrix from "TM.npy". It is passed unchanged to the
# environment's next_state_func and reward_func in the training loop.
Time_matrix = np.load("TM.npy")

#### Tracking the state-action pairs for checking convergence


In [3]:
# Nested mapping state -> action -> list; the tracked (state, action) entries
# are created by initialise_states_dict().
states_dict = collections.defaultdict(dict)

In [4]:
# Initialise the states to be tracked
def initialise_states_dict():
    """Create an empty history list for every tracked (state, action) pair.

    The entries are written into the notebook-level ``states_dict``; any
    existing list for a tracked pair is replaced by a fresh empty one.
    """
    tracked_pairs = (
        ((0, 0, 0), (0, 2)),
        ((0, 0, 0), (0, 3)),
    )
    for tracked_state, tracked_action in tracked_pairs:
        states_dict[tracked_state][tracked_action] = []

In [5]:
#Defining a function to save the Q-dictionary as a pickle file
def save_obj(obj, name):
    """Pickle ``obj`` into the file ``name + '.pkl'``.

    Args:
        obj: any picklable object.
        name: output path without the ``.pkl`` extension.
    """
    destination = name + '.pkl'
    with open(destination, 'wb') as handle:
        # Serialise with the newest protocol available to this interpreter.
        handle.write(pickle.dumps(obj, protocol=pickle.HIGHEST_PROTOCOL))

### Agent Class

If you are using this framework, fill in the following items to complete the code block below:
1. State and Action Size
2. Hyperparameters
3. Create a neural-network model in function 'build_model()'
4. Define epsilon-greedy strategy in function 'get_action()'
5. Complete the function 'append_sample()'. This function appends the recent experience tuple <state, action, reward, new-state> to the memory
6. Complete the 'train_model()' function with following logic:
   - If the memory size is greater than mini-batch size, you randomly sample experiences from memory as per the mini-batch size and do the following:
      - Initialise your input and output batch for training the model
      - Calculate the target Q-value for each sample: `reward + gamma * max_a' Q(s', a')`
      - Get Q(s', a) values from the last trained model
      - Update the input batch as your encoded state and output batch as your Q-values
      - Then fit your DQN model using the updated input and output batch.

In [6]:
class DQNAgent:
    """Deep Q-Network agent with an epsilon-greedy policy and a replay memory.

    The agent owns a single Keras model that maps an encoded state vector of
    length ``state_size`` to one Q-value per action (``action_size`` outputs).
    Experiences are stored in a bounded deque and replayed in mini-batches.
    """

    def __init__(self, state_size, action_size):
        """Set hyperparameters, the replay memory and the Q-network.

        Args:
            state_size: length of the encoded state vector fed to the network.
            action_size: number of actions, i.e. number of network outputs.
        """
        # Define size of state and action
        self.state_size = state_size
        self.action_size = action_size

        # Hyperparameters for the DQN
        self.discount_factor = 0.95
        self.learning_rate = 0.001
        self.epsilon_max = self.epsilon = 1
        self.epsilon_decay = 0.0003
        self.epsilon_min = 0.00001

        self.batch_size = 32

        # create replay memory using deque (oldest samples drop out past 2000)
        self.memory = deque(maxlen=2000)

        # Q-values recorded over time for the tracked state (see store_q_vals)
        self.states_tracked = []
        # NOTE: relies on the notebook-level `env` existing when the agent is
        # instantiated. The encoded state is reshaped to a batch of one using
        # state_size instead of a hard-coded width.
        self.track_state = np.array(env.state_encod_arch1([0, 0, 0])).reshape(1, self.state_size)

        # create the Q-network (a single model is used for both prediction and targets)
        self.model = self.build_model()

    # approximate Q function using Neural Network
    def build_model(self):
        """Build and compile the fully connected Q-network.

        Returns:
            A compiled Keras ``Sequential`` model with three hidden ReLU layers
            of 32 units and a linear output of ``action_size`` Q-values.
        """
        model = Sequential()

        # input layer
        model.add(Dense(32, input_dim=self.state_size, activation='relu', kernel_initializer='he_uniform'))

        # layer - 2
        model.add(Dense(32, activation='relu', kernel_initializer='he_uniform'))

        # layer - 3
        model.add(Dense(32, activation='relu', kernel_initializer='he_uniform'))

        # output layer: linear, because Q-values are unbounded and may be
        # negative; a ReLU here would clamp every estimate to >= 0.
        model.add(Dense(self.action_size, activation='linear', kernel_initializer='he_uniform'))

        model.compile(loss='mse', optimizer=Adam(learning_rate=self.learning_rate))
        model.summary()
        return model

    def get_action(self, state, action_space, possible_actions):
        """Pick an action with the epsilon-greedy policy.

        Args:
            state: encoded state vector (length ``state_size``).
            action_space: sequence of all actions, indexed by action index.
            possible_actions: indices into ``action_space`` that are allowed
                in the current state.

        Returns:
            Tuple ``(action_index, action)``.
        """
        if np.random.rand() <= self.epsilon:
            # explore: choose a random action from all possible actions
            index = random.randrange(len(possible_actions))
            action_index = possible_actions[index]
            return action_index, action_space[action_index]

        # exploit: choose the allowed action with the highest q(s, a).
        # The first axis is the batch size, so reshape to (1, state_size).
        state = np.array(state).reshape(1, self.state_size)
        q_values = self.model.predict(state, verbose=0)[0]

        candidates = list(possible_actions)
        if not candidates:
            # NOTE(review): no allowed actions were supplied; fall back to the
            # unrestricted argmax - confirm the environment never does this.
            candidates = list(range(len(q_values)))

        # Restrict the argmax to allowed actions so the greedy branch cannot
        # return an action that is unavailable in this state.
        best_position = int(np.argmax(q_values[candidates]))
        action_index = candidates[best_position]
        return action_index, action_space[action_index]

    def append_sample(self, state, action, reward, next_state):
        """Save the experience <s, a, r, s'> to the replay memory."""
        self.memory.append((state, action, reward, next_state))

    # pick samples randomly from replay memory (with batch_size) and train the network
    def train_model(self):
        """Run one gradient step on a random mini-batch from the replay memory.

        Does nothing until the memory holds more than ``batch_size`` samples.
        """
        if len(self.memory) <= self.batch_size:
            return

        # Sample batch from the memory
        mini_batch = random.sample(self.memory, self.batch_size)
        states = np.zeros((self.batch_size, self.state_size))
        next_states = np.zeros((self.batch_size, self.state_size))
        actions, rewards = [], []

        for i, (state, action, reward, next_state) in enumerate(mini_batch):
            states[i] = state
            next_states[i] = next_state
            actions.append(action)
            rewards.append(reward)

        # Predict once per batch, after both arrays are fully populated,
        # instead of once per sample on partially filled batches.
        # 1. Current Q-value estimates; only the taken action's entry is changed.
        target = self.model.predict(states, verbose=0)
        # 2. Q-values of the next states, used for the bootstrap term.
        next_q_values = self.model.predict(next_states, verbose=0)

        # 3. Q-learning target: r + gamma * max_a' Q(s', a')
        rows = np.arange(self.batch_size)
        target[rows, actions] = (
            np.asarray(rewards) + self.discount_factor * np.max(next_q_values, axis=1)
        )

        # 4. Fit the model on the updated targets
        self.model.fit(states, target, batch_size=self.batch_size, epochs=1, verbose=0)

    def save(self, name):
        """Save the Keras model to the path ``name``."""
        self.model.save(name)

    def store_q_vals(self):
        """Append the current Q-value of the tracked state for action index 2."""
        q_val = self.model.predict(self.track_state, verbose=0)
        self.states_tracked.append(q_val[0][2])

In [7]:
# Number of training episodes run by the DQN training loop.
Episodes = 20000

### DQN block

In [8]:

# Train the DQN agent. An episode ends once the accumulated wait + transit +
# ride time reaches 24 * 30 (presumably hours, i.e. one 30-day month - confirm
# against the environment's time units).
start_time = time.time()
rewards_per_episode, episodes, avg_rewards_per_episode = [], [], []

env = CabDriver()
agent = DQNAgent(
    action_size=len(env.action_space),
    state_size=len(env.state_encod_arch1(env.state_init)),
)

for episode in range(Episodes):

    # Fresh environment for every episode
    env = CabDriver()
    # Per-episode bookkeeping
    score = 0
    total_time = 0  # simulated time elapsed inside this episode
    terminal_state = False
    action_space, state_space, state = env.reset()

    while not terminal_state:

        # 1. Pick epsilon-greedy action from possible actions for the current state
        possible_actions_index, actions = env.requests(state)
        action_index, action = agent.get_action(
            env.state_encod_arch1(state), env.action_space, possible_actions_index
        )

        # 2. Evaluate your reward and next state
        next_state, wait_time, transit_time, ride_time = env.next_state_func(state, action, Time_matrix)
        reward = env.reward_func(state, action, Time_matrix)

        # 3. Append the experience to the memory
        agent.append_sample(
            env.state_encod_arch1(state), action_index, reward, env.state_encod_arch1(next_state)
        )

        # 4. Train the model by calling function agent.train_model
        agent.train_model()

        # 5. Keep a track of rewards and simulated time
        score += reward
        state = next_state
        total_time += wait_time + transit_time + ride_time

        if total_time >= 24 * 30:
            print("state terminated")
            terminal_state = True

    # store total reward obtained in this episode
    rewards_per_episode.append(score)
    episodes.append(episode)

    # epsilon decay: exponential in the episode number, floored at epsilon_min
    if agent.epsilon > agent.epsilon_min:
        agent.epsilon = agent.epsilon_min + (agent.epsilon_max - agent.epsilon_min) * np.exp(-agent.epsilon_decay*episode)

    # every episode
    print("episode {0}, reward {1}, memory_length {2}, epsilon {3}".format(episode, score, len(agent.memory), agent.epsilon))

    # every 10 episodes:
    if episode % 10 == 0:
        agent.store_q_vals()  # store q-values of some prespecified state-action pairs
    if episode % 1000 == 0:
        # Keras writes its own model format, not a pickle; ".h5" is accepted by
        # both TF2-Keras and Keras 3, whereas Keras 3 rejects a ".pkl" path.
        agent.save(name="model.h5")
        # Wall-clock time gets its own name so it is not confused with the
        # per-episode simulated clock `total_time`.
        elapsed_seconds = time.time() - start_time
        print('Total time :', elapsed_seconds)

After reset (2, 16, 3)
After reset (5, 11, 2)
After reset (2, 5, 4)
Next state :  0 2 0 5 4
Next state :  0 2 0 5 4
Next state :  0 4 3 8 4
Next state :  0 4 3 8 4
Next state :  0 0 1 17 4
Next state :  0 0 1 17 4
Next state :  0 3 0 8 5
Next state :  0 3 0 8 5
Next state :  0 4 1 14 5
Next state :  0 4 1 14 5
Next state :  0 0 1 19 5
Next state :  0 0 1 19 5
Next state :  0 0 2 5 6
Next state :  0 0 2 5 6
Next state :  0 1 2 14 6
Next state :  0 1 2 14 6
Next state :  0 0 1 19 6
Next state :  0 0 1 19 6
Next state :  0 2 0 6 0
Next state :  0 2 0 6 0
Next state :  0 4 2 15 0
Next state :  0 4 2 15 0
Next state :  0 0 3 19 0
Next state :  0 0 3 19 0
Next state :  0 3 0 5 1
Next state :  0 3 0 5 1
Next state :  0 3 0 14 1
Next state :  0 3 0 14 1
Next state :  0 4 0 16 1
Next state :  0 4 0 16 1
Next state :  0 4 1 19 1
Next state :  0 4 1 19 1
Next state :  0 3 0 21 1
Next state :  0 3 0 21 1
Next state :  0 2 1 1 2
Next state :  0 2 1 1 2
Next state :  0 2 3 10 2
Next state :  0 2 3 1

KeyboardInterrupt: 

In [None]:
# Wall-clock seconds elapsed since `start_time` was set at the top of the
# training cell.
total_time = time.time() - start_time
print('Total time : ',total_time)

### Tracking Convergence

In [None]:
# Convergence check: total reward collected in each episode, in training order.
fig, ax = plt.subplots()
ax.plot(np.arange(len(rewards_per_episode)), rewards_per_episode)
ax.set_ylabel("Total Rewards")
plt.show()

In [None]:
# Average rewards per 100 episodes
def _window_means(values, window=100):
    """Return the mean of each consecutive ``window``-sized chunk of ``values``.

    The last chunk may be shorter than ``window``. An empty input yields an
    empty list instead of dividing by zero, so this cell still runs when
    training was interrupted before any episode finished.
    """
    chunks = (values[start:start + window] for start in range(0, len(values), window))
    return [sum(chunk) / len(chunk) for chunk in chunks]


# The `episodes` list built by the training loop is deliberately left untouched.
avg_rewards = _window_means(rewards_per_episode)
print(avg_rewards)

In [None]:
# Convergence check: mean reward of each 100-episode window, in order.
fig, ax = plt.subplots()
ax.plot(np.arange(len(avg_rewards)), avg_rewards)
ax.set_ylabel("avg rewards")
plt.show()

In [None]:
# Tracked Q-value history (recorded by agent.store_q_vals) on a log-scaled y axis.
q_history = np.asarray(agent.states_tracked)
xaxis = np.arange(len(q_history))
plt.figure(0, figsize=(10, 6))
plt.semilogy(xaxis, q_history)
plt.title('Q_value for state [0,0,0] and action (0,2)')
plt.show()

#### Epsilon-decay sample function

<div class="alert alert-block alert-info">
Try building a similar epsilon-decay function for your model.
</div>

In [None]:
time = np.arange(0,10000)
epsilon = []
for i in range(0,10000):
    epsilon.append(0 + (1 - 0) * np.exp(-0.0009*i))

In [None]:
plt.plot(time, epsilon)
plt.show()