### Cab-Driver Agent

In [1]:
# Importing libraries
import numpy as np
import random
import math
from collections import deque
import collections
import pickle
import time

# for building DQN model
from keras import layers
from keras import Sequential
from keras.layers import Dense, Activation, Flatten
from keras.optimizers import Adam

# for plotting graphs
import matplotlib.pyplot as plt

# Import the environment
from Env import CabDriver

Using TensorFlow backend.


#### Defining Time Matrix

In [2]:
# Loading the time matrix provided
Time_matrix = np.load("TM.npy")

#### Check what the max, min and mean time values are. This will help us in defining the 'next_step' function in the Environment.

In [3]:
print(type(Time_matrix))
print(Time_matrix.max())
print(Time_matrix.min())
print(Time_matrix.mean())
print(Time_matrix.var())

<class 'numpy.ndarray'>
11.0
0.0
3.0542857142857143
7.93705306122449


#### Since the max time is 11 hours between any 2 points, the next state of the cab driver may increase at most by  1 day.

### Agent Class

If you are using this framework, you need to fill the following to complete the following code block:
1. State and Action Size
2. Hyperparameters
3. Create a neural-network model in function 'build_model()'
4. Define epsilon-greedy strategy in function 'get_action()'
5. Complete the function 'append_sample()'. This function appends the recent experience tuple <state, action, reward, new-state> to the memory
6. Complete the 'train_model()' function with following logic:
   - If the memory size is greater than mini-batch size, you randomly sample experiences from memory as per the mini-batch size and do the following:
      - Initialise your input and output batch for training the model
      - Calculate the target Q value for each sample: reward + gamma*max(Q(s'a,))
      - Get Q(s', a) values from the last trained model
      - Update the input batch as your encoded state and output batch as your Q-values
      - Then fit your DQN model using the updated input and output batch.

In [4]:
class DQNAgent:
    def __init__(self, state_size, action_size):
        # Define size of state and action
        self.state_size = state_size
        self.action_size = action_size

        # Write here: Specify you hyper parameters for the DQN
        self.discount_factor = 0.95
        self.learning_rate = 0.06 # 0.06 after fix was better
        self.epsilon = 1
        self.epsilon_max = 1
        self.epsilon_decay = -0.0007 #for 3k
        self.epsilon_min = 0.00001
        
        self.batch_size = 32

        # create replay memory using deque
        self.memory = deque(maxlen=2000)

        # Initialize the value of the states tracked
        self.states_tracked = []
        
        # We are going to track state [0,0,0] and action () at index 2 in the action space.
        self.track_state = np.array(env.state_encod_arch1([0,0,0])).reshape(1, 36)

        # create main model and target model
        self.model = self.build_model()

    # approximate Q function using Neural Network
    def build_model(self):
        """
        Function that takes in the agent and constructs the network
        to train it
        @return model
        @params agent
        """
        input_shape = self.state_size
        model = Sequential()
        # Write your code here: Add layers to your neural nets       
        model.add(Dense(32, input_dim=self.state_size, activation='relu', kernel_initializer='he_uniform'))
        model.add(Dense(32, activation='relu', kernel_initializer='he_uniform'))
        # the output layer: output is of size num_actions
        model.add(Dense(self.action_size, activation='relu', kernel_initializer='he_uniform'))
        model.compile(loss='mse', optimizer=Adam(lr=self.learning_rate))
        model.summary
        return model

    def get_action(self, state, possible_actions_index, actions):
        """
        get action in a state according to an epsilon-greedy approach
        possible_actions_index, actions are the 'ride requests' that teh driver got.
        """        
        # get action from model using epsilon-greedy policy
        # Decay in ε after each episode       
        if np.random.rand() <= self.epsilon:
            # explore: choose a random action from the ride requests
            return random.choice(possible_actions_index)
        else:
            # choose the action with the highest q(s, a)
            # the first index corresponds to the batch size, so
            # reshape state to (1, state_size) so that the first index corresponds to the batch size
            state = np.array(env.state_encod_arch1(state)).reshape(1, 36)

            # Use the model to predict the Q_values.
            q_value = self.model.predict(state)

            # truncate the array to only those actions that are part of the ride  requests.
            q_vals_possible = [q_value[0][i] for i in possible_actions_index]

            return possible_actions_index[np.argmax(q_vals_possible)]

    def append_sample(self, state, action_index, reward, next_state, done):
        """appends the new agent run output to replay buffer"""
        self.memory.append((state, action_index, reward, next_state, done))
        
    # pick samples randomly from replay memory (with batch_size) and train the network
    def train_model(self):
        """ 
        Function to train the model on eacg step run.
        Picks the random memory events according to batch size and 
        runs it through the network to train it.
        """
        if len(self.memory) > self.batch_size:
            # Sample batch from the memory
            mini_batch = random.sample(self.memory, self.batch_size)
            # initialise two matrices - update_input and update_output
            update_input = np.zeros((self.batch_size, self.state_size))
            update_output = np.zeros((self.batch_size, self.state_size))
            actions, rewards, done = [], [], []

            # populate update_input and update_output and the lists rewards, actions, done
            for i in range(self.batch_size):
                state, action, reward, next_state, done_boolean = mini_batch[i]
                update_input[i] = env.state_encod_arch1(state)     
                actions.append(action)
                rewards.append(reward)
                update_output[i] = env.state_encod_arch1(next_state)
                done.append(done_boolean)

            # predict the target q-values from states s
            target = self.model.predict(update_input)
            # target for q-network
            target_qval = self.model.predict(update_output)


            # update the target values
            for i in range(self.batch_size):
                if done[i]:
                    target[i][actions[i]] = rewards[i]
                else: # non-terminal state
                    target[i][actions[i]] = rewards[i] + self.discount_factor * np.max(target_qval[i])
            # model fit
            self.model.fit(update_input, target, batch_size=self.batch_size, epochs=1, verbose=0)
            
    def save_tracking_states(self):
        # Use the model to predict the q_value of the state we are tacking.
        q_value = self.model.predict(self.track_state)
        
        # Grab the q_value of the action index that we are tracking.
        self.states_tracked.append(q_value[0][2])

    def save(self, name):
        with open(name, 'wb') as file:  
            pickle.dump(self.model, file,pickle.HIGHEST_PROTOCOL)

### DQN block

for episode in range(Episodes):

    # Write code here
    # Call the environment
    # Call all the initialised variables of the environment
    

    #Call the DQN agent
    
    
    while !terminal_state:
        
        # Write your code here
        # 1. Pick epsilon-greedy action from possible actions for the current state
        # 2. Evaluate your reward and next state
        # 3. Append the experience to the memory
        # 4. Train the model by calling function agent.train_model
        # 5. Keep a track of rewards, Q-values, loss
        

In [None]:
episode_time = 24*30 #30 days before which car has to be recharged
n_episodes = 3000
m = 5
t = 24
d = 7

# Invoke Env class
env = CabDriver()
action_space, state_space, state = env.reset()

# Set up state and action sizes.
state_size = m+t+d
action_size = len(action_space)

# Invoke agent class
agent = DQNAgent(action_size=action_size, state_size=state_size)

# to store rewards in each episode
rewards_per_episode, episodes = [], []
# Rewards for state [0,0,0] being tracked.
rewards_init_state = []

### Run the episodes build up replay buffer and train the model.

In [None]:
start_time = time.time()
score_tracked = []

for episode in range(n_episodes):

    done = False
    score = 0
    track_reward = False
    # reset at the start of each episode
    env = CabDriver()
    action_space, state_space, state = env.reset()
    # Save the initial state so that reward can be tracked if initial state is [0,0,0]
    initial_state = env.state_init
    if (initial_state == [0,0,0]):
        track_reward = True

    total_time = 0  # Total time driver rode in this episode
    while not done:
        # 1. Get a list of the ride requests driver got.
        possible_actions_indices, actions = env.requests(state)
        # 2. Pick epsilon-greedy action from possible actions for the current state.
        action = agent.get_action(state, possible_actions_indices, actions)

        # 3. Evaluate your reward and next state
        reward, next_state, step_time = env.step(state, env.action_space[action], Time_matrix)

        # 4. Total time driver rode in this episode
        total_time += step_time
        if (total_time > episode_time):
            # if ride does not complete in stipu;ated time skip
            # it and move to next episode.
            done = True
        else:
            # 5. Append the experience to the memory
            agent.append_sample(state, action, reward, next_state, done)
            # 6. Train the model by calling function agent.train_model
            agent.train_model()
            # 7. Keep a track of rewards, Q-values, loss
            score += reward
            state = next_state

    # store total reward obtained in this episode
    rewards_per_episode.append(score)
    episodes.append(episode)
    
    if (track_reward == True):
        # Track the reward separately for the state [0,0,0]
        rewards_init_state.append(score)

    # epsilon decay
    agent.epsilon = (1 - 0.00001) * np.exp(agent.epsilon_decay * episode)

    # every 1o episodes:
    if (episode % 10 == 0):
        print("episode {0}, reward {1}, memory_length {2}, epsilon {3} total_time {4}".format(episode,
                                                                         score,
                                                                         len(agent.memory),
                                                                         agent.epsilon, total_time))
    # Save the Q_value of the state, action pair we are tracking
    agent.save_tracking_states()
    score_tracked.append(score) 

    if(episode % 500 == 0):
        print("Saving Model {}".format(episode))
        agent.save(name="model_weights.pkl")
    
elapsed_time = time.time() - start_time
print(elapsed_time)


episode 0, reward -130.0, memory_length 122, epsilon 0.99999 total_time 721.0
Saving Model 0
episode 10, reward -120.0, memory_length 1505, epsilon 0.9930145126888058 total_time 725.0
episode 20, reward -318.0, memory_length 2000, epsilon 0.9860876832874194 total_time 723.0
episode 30, reward -160.0, memory_length 2000, epsilon 0.9792091723798139 total_time 730.0
episode 40, reward 27.0, memory_length 2000, epsilon 0.9723786429175789 total_time 723.0
episode 50, reward -323.0, memory_length 2000, epsilon 0.965595760203404 total_time 728.0
episode 60, reward -123.0, memory_length 2000, epsilon 0.9588601918746789 total_time 721.0
episode 70, reward -43.0, memory_length 2000, epsilon 0.9521716078872079 total_time 723.0
episode 80, reward -246.0, memory_length 2000, epsilon 0.9455296804990374 total_time 727.0
episode 90, reward -213.0, memory_length 2000, epsilon 0.9389340842543964 total_time 721.0
episode 100, reward -354.0, memory_length 2000, epsilon 0.9323844959677493 total_time 721.0


episode 900, reward 194.0, memory_length 2000, epsilon 0.5325864750888871 total_time 729.0
episode 910, reward 240.0, memory_length 2000, epsilon 0.5288713877389174 total_time 725.0
episode 920, reward -162.0, memory_length 2000, epsilon 0.5251822151927655 total_time 723.0
episode 930, reward 109.0, memory_length 2000, epsilon 0.5215187766802384 total_time 728.0
episode 940, reward 125.0, memory_length 2000, epsilon 0.5178808926921159 total_time 723.0
episode 950, reward -155.0, memory_length 2000, epsilon 0.5142683849713549 total_time 721.0
episode 960, reward 59.0, memory_length 2000, epsilon 0.5106810765043542 total_time 729.0
episode 970, reward 397.0, memory_length 2000, epsilon 0.5071187915122811 total_time 728.0
episode 980, reward 17.0, memory_length 2000, epsilon 0.5035813554424584 total_time 731.0
episode 990, reward 122.0, memory_length 2000, epsilon 0.5000685949598108 total_time 727.0
episode 1000, reward 360.0, memory_length 2000, epsilon 0.49658033793837164 total_time 724

episode 1790, reward 168.0, memory_length 2000, epsilon 0.2856437139871857 total_time 723.0
episode 1800, reward 182.0, memory_length 2000, epsilon 0.2836511899595054 total_time 731.0
episode 1810, reward 288.0, memory_length 2000, epsilon 0.28167256489688713 total_time 733.0
episode 1820, reward 231.0, memory_length 2000, epsilon 0.2797077418463068 total_time 732.0
episode 1830, reward 415.0, memory_length 2000, epsilon 0.277756624531042 total_time 724.0
episode 1840, reward 339.0, memory_length 2000, epsilon 0.2758191173459537 total_time 724.0
episode 1850, reward 394.0, memory_length 2000, epsilon 0.2738951253528023 total_time 724.0
episode 1860, reward 177.0, memory_length 2000, epsilon 0.27198455427559504 total_time 731.0
episode 1870, reward 72.0, memory_length 2000, epsilon 0.27008731049596707 total_time 725.0
episode 1880, reward 208.0, memory_length 2000, epsilon 0.26820330104859336 total_time 731.0
episode 1890, reward 137.0, memory_length 2000, epsilon 0.2663324336166342 tot

### Tracking Convergence

In [None]:
agent.states_tracked

In [None]:
state_tracked_sample = [agent.states_tracked[i] for i in range(len(agent.states_tracked)) if agent.states_tracked[i] < 1000]

In [None]:
env.action_space[2]

### Plot the Q-Value convergence for state action pairs

In [None]:
plt.figure(0, figsize=(16,7))
plt.title('Q_value for state [0,0,0]  action (0,2)')
xaxis = np.asarray(range(0, len(state_tracked_sample)))
plt.plot(xaxis,np.asarray(state_tracked_sample))
plt.show()

In [None]:
score_tracked_sample = [score_tracked[i] for i in range(len(score_tracked)) if (i % 4 == 0)]

### Plot the Reward convergence for the tracked state.

In [None]:
plt.figure(0, figsize=(16,7))
plt.title('Reward for init state [0, 0, 0]')
xaxis = np.asarray(range(0, len(score_tracked_sample)))
plt.plot(xaxis,np.asarray(score_tracked_sample))
plt.show()

#### Epsilon-decay sample function

<div class="alert alert-block alert-info">
Try building a similar epsilon-decay function for your model.
</div>

In [None]:
time = np.arange(0,10000)
epsilon = []
for i in range(0,10000):
    epsilon.append(0 + (1 - 0) * np.exp(-0.0009*i))

In [None]:
plt.plot(time, epsilon)
plt.show()

In [None]:
time = np.arange(0,3000)
epsilon = []
for i in range(0,3000):
    epsilon.append(0 + (1 - 0.00001) * np.exp(-0.0007*i))
    z = np.random.random()

In [None]:
time = np.arange(0,3000)
epsilon = []
epsilon_c = 1
for i in range(0,3000):
    epsilon.append(epsilon_c)
    epsilon_c = epsilon_c * 0.999

In [None]:
plt.plot(time, epsilon)
plt.show()