### Cab-Driver Agent

#### Defining Time Matrix

In [None]:
# Importing libraries
import numpy as np
import random
import math
from collections import deque
import collections
import pickle
import pylab
import os

# for building DQN model
from keras import layers
from keras import Sequential
from keras.layers import Dense, Activation, Flatten
from keras.optimizers import Adam

# for plotting graphs
import matplotlib.pyplot as plt

In [None]:
# Import the environment
from Env import CabDriver

# Create the environment object. The training loop below reads its initial
# state, action space, requests, rewards and state transitions from `env`.
env = CabDriver()

In [None]:
# Load the provided time matrix from "TM.npy". It is passed unchanged to
# env.reward_func and env.next_state_func in the training loop.
Time_matrix = np.load("TM.npy")

#### Tracking the state-action pairs for checking convergence


In [None]:
# Create the directory that holds the saved models and graphs.
# exist_ok=True makes this a no-op when the directory is already there and
# avoids the race of checking for existence first and creating afterwards.
directory_model = 'save_model'
os.makedirs(directory_model, exist_ok=True)

In [None]:
def Q_state(state):
    """Return the string key used for ``state`` in the Q-value dictionaries.

    Each component of ``state`` is rendered with ``str`` and the pieces are
    concatenated in order. A component smaller than 10 gets a leading "0",
    so single-digit values occupy two characters (e.g. ``[2, 4, 5]`` becomes
    ``"020405"``).
    """
    return "".join(
        ("0" + str(component)) if component < 10 else str(component)
        for component in state
    )

In [None]:
# Defining a function which will add new Q-values to the Q-dictionary. 
def add_to_dict(state, q_val, action, epi):
    """Record one observed Q-value in the global ``Q_dict``.

    ``state`` is converted to its string key with ``Q_state``. The pair
    ``(epi, q_val)`` is appended to the list kept for ``action`` under that
    key; the list is created the first time the action is seen.
    """
    action_history = Q_dict[Q_state(state)]
    action_history.setdefault(action, []).append((epi, q_val))

In [None]:
# Function to save q-value for tracking states-action pair
def save_tracking_states():
    """Refresh ``States_track`` with the Q-values recorded so far in ``Q_dict``.

    The tracked lists are emptied first, because every call copies the
    complete recorded history again. Only the Q-value (second element of each
    stored tuple) is copied. Each tracked pair and the final ``States_track``
    are printed.
    """
    initialise_tracking_states()

    for state, tracked_actions in States_track.items():
        for action in tracked_actions:
            print(state, action)
            if state in Q_dict and action in Q_dict[state]:
                tracked_actions[action].extend(
                    entry[1] for entry in Q_dict[state][action]
                )
    print(States_track)

In [None]:
# Function to (re)initialise the state-action pairs tracked for convergence
def initialise_tracking_states():
    """Give each tracked (state, action) pair an empty list in ``States_track``.

    Four sample pairs are tracked. Calling this again discards whatever values
    were stored for them before.
    """
    tracked_pairs = (
        ('020405', (2, 4)),
        ('031006', (3, 5)),
        ('040502', (4, 3)),
        ('011402', (1, 3)),
    )
    for state, action in tracked_pairs:
        # list that will hold the Q-values collected for this state-action pair
        States_track[state][action] = []

In [None]:
# Defining a function to save the Q-dictionary as a pickle file
def save_obj(obj, name):
    """Pickle ``obj`` into the file ``name`` + '.pkl' with the highest protocol.

    ``name`` is the path without the extension; an existing file at that path
    is overwritten.
    """
    target_path = name + '.pkl'
    with open(target_path, 'wb') as handle:
        pickle.dump(obj, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Initialise Q_dictionary and States_tracked (for convergence).
# Both are defaultdicts of dicts, so indexing an unseen state key creates an
# empty inner dict instead of raising KeyError.
# Q_dict:       state key -> {action: [(episode, q_value), ...]}  (filled by add_to_dict)
# States_track: state key -> {action: [q_value, ...]}             (filled by save_tracking_states)
Q_dict = collections.defaultdict(dict)
States_track = collections.defaultdict(dict)

# Register the tracked state-action pairs with empty lists
initialise_tracking_states()

### Hyperparameters

In [None]:
# Defining parameters for the experiment

Episodes = 1000              # number of episodes run by the training loop
AN_EPISOD_DAYS = 30          # an episode ends once the day counter exceeds this value
LR = 0.001                   # learning rate of the Adam optimizer
GAMMA = 0.91                 # discount factor used in the Q-learning target


threshold = 100            # every these many episodes, the 4 tracked Q-values are stored (convergence graphs) and the score plot is saved
policy_threshold = 400    # every these many episodes, the model weights and the Q-dictionary ('Policy') are saved

### Agent Class

If you are using this framework, complete the following parts of the code block below:
1. State and Action Size
2. Hyperparameters
3. Create a neural-network model in function 'build_model()'
4. Define epsilon-greedy strategy in function 'get_action()'
5. Complete the function 'append_sample()'. This function appends the recent experience tuple <state, action, reward, new-state> to the memory
6. Complete the 'train_model()' function with following logic:
   - If the memory size is greater than mini-batch size, you randomly sample experiences from memory as per the mini-batch size and do the following:
      - Initialise your input and output batch for training the model
      - Calculate the target Q value for each sample: reward + gamma * max_a Q(s', a)
      - Get Q(s', a) values from the last trained model
      - Update the input batch as your encoded state and output batch as your Q-values
      - Then fit your DQN model using the updated input and output batch.

In [None]:
class DQNAgent:
    """DQN agent with an online Q-network, a target Q-network and replay memory.

    The agent chooses epsilon-greedy actions restricted to the actions the
    environment currently allows, stores every transition in a bounded replay
    memory, and trains the online network on random mini-batches drawn from
    that memory. Targets are bootstrapped from the target network.
    """

    def __init__(self, state_size, action_size):
        """Set the hyperparameters, create the replay memory and build both networks.

        Args:
            state_size: Length of the encoded state vector fed to the network.
            action_size: Number of network outputs (one Q-value per action).
        """
        # Define size of state and action
        self.state_size = state_size
        self.action_size = action_size

        # Hyperparameters for the DQN
        self.discount_factor = GAMMA
        self.learning_rate = LR
        self.epsilon = 1.0           # initial exploration rate
        self.epsilon_decay = 0.999   # multiplicative decay applied per stored sample
        self.epsilon_min = 0.01      # decay stops once epsilon is not above this value
        self.batch_size = 32
        self.train_start = 100       # minimum number of stored samples before training

        # create replay memory using deque (oldest samples are dropped when full)
        self.memory = deque(maxlen=2000)

        # create main model and target model
        self.model = self.build_model()
        self.target_model = self.build_model()

        # The two networks are initialised independently; start the target
        # network from the online network's weights so the first bootstrapped
        # targets come from the same function that is being trained.
        self.update_target_model()

    def save_model_graph(self, suffix):
        """Write the online model's architecture as JSON into ``directory_model``.

        Args:
            suffix: Text appended to the file name before the ".json" extension.
        """
        # serialize model to JSON
        model_json = self.model.to_json()
        with open("./" + directory_model + "/cardriver_model_" + suffix + ".json", "w") as json_file:
            json_file.write(model_json)

    def build_model(self):
        """Build and compile the neural network that approximates Q(s, ·).

        Returns:
            A compiled Keras ``Sequential`` model taking ``state_size`` inputs
            and producing ``action_size`` Q-values.
        """
        model = Sequential()

        # Four hidden layers of 32 ReLU units each
        model.add(Dense(32, input_dim=self.state_size, activation='relu', kernel_initializer='he_uniform'))
        model.add(Dense(32, activation='relu', kernel_initializer='he_uniform'))
        model.add(Dense(32, activation='relu', kernel_initializer='he_uniform'))
        model.add(Dense(32, activation='relu', kernel_initializer='he_uniform'))

        # Output layer [Q(s,a1), Q(s,a2), ...... Q(s,ak)].
        # Linear activation: Q-values are unbounded regression targets, and a
        # ReLU here would clamp every negative estimate to zero.
        model.add(Dense(self.action_size, activation='linear', kernel_initializer='he_uniform'))

        # NOTE(review): `lr` is the legacy keyword of Adam; newer Keras releases
        # expect `learning_rate` - confirm against the installed version.
        model.compile(loss='mse', optimizer=Adam(lr=self.learning_rate))
        model.summary()
        return model

    def update_target_model(self):
        """Copy the online network's weights and biases into the target network."""
        self.target_model.set_weights(self.model.get_weights())

    def get_action(self, state, possible_actions):
        """Pick an epsilon-greedy action among ``possible_actions``.

        Args:
            state: Encoded state as a batch of one, as accepted by
                ``self.model.predict``.
            possible_actions: Indices (into the network output) of the actions
                currently allowed by the environment.

        Returns:
            Tuple ``(choice, q_value)``: the chosen action index and the
            Q-value the online network predicts for it.
        """
        q_values = self.model.predict(state)
        if np.random.rand() <= self.epsilon:
            # explore: choose a random action from all possible actions
            choice = random.choice(possible_actions)
        else:
            # exploit: best action among the allowed ones. The index is taken
            # from possible_actions itself; searching the full Q-vector for
            # the max value could return a disallowed action with an equal
            # Q-value.
            possible_action_q_values = [q_values[0][index] for index in possible_actions]
            choice = possible_actions[int(np.argmax(possible_action_q_values))]

        return choice, q_values[0][choice]

    def append_sample(self, state, action, reward, next_state, done):
        """Store the transition (s, a, r, s', done) and decay epsilon once.

        Epsilon is multiplied by ``epsilon_decay`` for every stored sample
        while it is still above ``epsilon_min``.
        """
        self.memory.append((state, action, reward, next_state, done))

        # Decay in epsilon after we generate each sample from the environment
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

    def train_model(self):
        """Train the online network on one random mini-batch from the memory.

        Does nothing until the memory holds at least ``train_start`` samples.
        For each sampled transition the target of the action taken is
        ``reward`` when the transition is terminal, and
        ``reward + gamma * max_a Q_target(s', a)`` otherwise. The targets of
        all other actions keep the online network's current prediction, so
        they contribute no error.
        """
        if len(self.memory) < self.train_start:
            return

        # Sample batch from the memory (never more than it holds)
        batch_size = min(self.batch_size, len(self.memory))
        mini_batch = random.sample(self.memory, batch_size)

        # Input batch (states) and the matching next states
        update_input = np.zeros((batch_size, self.state_size))
        update_target = np.zeros((batch_size, self.state_size))
        actions, rewards, terminal_states = [], [], []

        for i, (state, action, reward, next_state, done) in enumerate(mini_batch):
            update_input[i] = state
            actions.append(action)
            rewards.append(reward)
            update_target[i] = next_state
            terminal_states.append(done)

        # Current predictions of the online network: the starting point of the targets
        target = self.model.predict(update_input)

        # Q(s', a) from the target network, used for bootstrapping
        target_qval = self.target_model.predict(update_target)

        for i in range(batch_size):
            if terminal_states[i]:
                target[i][actions[i]] = rewards[i]
            else:  # non-terminal state
                target[i][actions[i]] = rewards[i] + self.discount_factor * np.max(target_qval[i])

        # Fit the online model on the updated targets
        self.model.fit(update_input, target, batch_size=batch_size, epochs=1, verbose=0)

    def get_model_weights(self):
        """Return the weights of the online network."""
        return self.model.get_weights()

    def save(self, name):
        """Save the online network's weights to the file ``name``."""
        self.model.save_weights(name)

### DQN block

In [None]:
# Train the DQN agent: one episode lasts until the day counter exceeds
# AN_EPISOD_DAYS. `scores` collects the total reward of every finished episode
# and `episodes` the matching episode indices.
scores, episodes = [], []
state = env.state_init
state_encoded = env.state_encod_arch1(state)
state_size = len(state_encoded)
action_size = len(env.action_space)
agent = DQNAgent(state_size, action_size)

for episode in range(Episodes):

    # Reset the per-episode bookkeeping and the environment
    terminal_state = False
    score = 0
    reward = 0
    total_days = 1
    previous_day = 0
    current_day = 0
    env.reset()
    state = env.state_init

    while not terminal_state:

        # 1. Pick epsilon-greedy action from possible actions for the current state
        state_encoded = env.state_encod_arch1(state)
        state_encoded = np.reshape(state_encoded, [1, state_size])

        # get possible list of actions from the environment
        possible_actions, action_list = env.requests(state)

        action, q_val = agent.get_action(state_encoded, possible_actions)

        # 2. Evaluate the reward and the next state
        reward = env.reward_func(state, env.action_space[action], Time_matrix)
        next_state = env.next_state_func(state, env.action_space[action], Time_matrix)
        next_state_encoded = env.state_encod_arch1(next_state)
        next_state_encoded = np.reshape(next_state_encoded, [1, state_size])

        # Decide whether this transition ends the episode BEFORE storing it,
        # so the replay memory receives the real `done` flag. Storing the flag
        # first would record every transition as non-terminal, and training
        # would bootstrap from the state after the end of the episode.
        # state[2] is compared between consecutive states to count day changes.
        previous_day = state[2]
        current_day = next_state[2]
        if previous_day != current_day:
            total_days = total_days + 1

        if total_days > AN_EPISOD_DAYS:
            terminal_state = True

        # 3. Append the experience to the memory
        agent.append_sample(state_encoded, action, reward, next_state_encoded, terminal_state)

        # 5. Keep a track of the Q-value predicted for the chosen action
        add_to_dict(state, q_val, env.action_space[action], episode)

        # 4. Train the model
        agent.train_model()

        score += reward
        state = next_state

        if terminal_state:
            # Sync the target network once per episode and record the score
            agent.update_target_model()
            scores.append(score)
            episodes.append(episode)

    print("Episode:", episode, "  score:", score, "  memory length:", len(agent.memory), "  epsilon:", agent.epsilon)

    #TRACKING Q-VALUES
    if ((episode+1) % threshold) == 0:   #every threshold episode
        save_tracking_states()
        save_obj(States_track,'States_tracked')

    # plot episode versus score
    # NOTE(review): the file names below embed the episode's score, not the
    # episode number - confirm this is intended.
    if episode % threshold == 0:
        pylab.plot(episodes, scores)
        pylab.savefig("./" + directory_model + "/cardriver_dqn_" + str(score) + ".png")

    #save the model
    if episode % policy_threshold == 0:
        # Model binary h5 file
        agent.save("./" + directory_model + "/cardriver_dqn_" + str(score) + ".h5")

        # Model pickle file
        save_obj(agent.get_model_weights(), "./" + directory_model + "/cardriver_dqn_" + str(score))

    #SAVING POLICY
    if ((episode+1)% policy_threshold ) == 0:  #every policy_threshold episodes, the Q-dict will be saved
        save_obj(Q_dict,'Policy')
        

### Tracking Convergence

In [None]:
# Display every Q-value recorded during training:
# state key -> {action: [(episode, q_value), ...]}
Q_dict


In [None]:
# Q-values for state-action pairs are obtained for different episodes
#------------------------------------------------------------------
# Location C for 17:00 hours on TUESDAY
# The displayed dictionary contains:
# {(Action-Pair): [(Episode1, Q(s,a)), (Episode2, Q(s,a)),....]}
# Q_dict is a defaultdict, so a state that was never visited shows up as {}.

Q_dict["031701"]

In [None]:
# Q-value tracking for Location C at 17:00 hours on TUESDAY, action C->D.
# Each stored entry is a tuple (episode, Q(s, a)) as appended by add_to_dict.
# The values for action (3, 4) are printed only if that action was ever
# recorded for this state.

print("All actions taken from state 031701 : {}\n".format(Q_dict["031701"].keys()))

print("All Q-values for action (3, 4) :  (EpisodeID, Q-value)")
if (3, 4) in Q_dict["031701"]:
    print(Q_dict["031701"][(3, 4)])


In [None]:
# Convergence graph: total reward (score) per episode, as accumulated by the
# training loop in `scores` against the episode indices in `episodes`

pylab.plot(episodes, scores)

In [None]:
# TRACKING Q-VALUES FOR DEFINED STATE & ACTION
# Display the Q-value lists collected by save_tracking_states for the tracked pairs
States_track

In [None]:
# Recorded (episode, Q-value) entries for state "031006" and action (3, 5).
# NOTE(review): raises KeyError if this action was never recorded for the state.
Q_dict["031006"][(3, 5)]

#### Epsilon-decay sample function

<div class="alert alert-block alert-info">
Try building a similar epsilon-decay function for your model.
</div>

In [None]:
# Sample exponential epsilon-decay curve over 10000 time steps: the value
# starts at 1 and decays towards 0 with rate 0.0009 per step.
time = np.arange(0, 10000)
epsilon = [0 + (1 - 0) * np.exp(-0.0009 * step) for step in range(0, 10000)]

In [None]:
# Plot the sample epsilon-decay curve (epsilon value against time step)
plt.plot(time, epsilon)
plt.show()