# Double Deep Q-Network (DDQN)
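The online neural network (NN) is trained with experience replay.<br>
A replay update is performed at every environment step during learning.<br>
Each update trains the NN on one mini-batch sampled at random from the replay memory.<br>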

In [None]:
# Mount Google Drive at /content/drive; later cells read and write the model
# checkpoint and the saved epsilon value under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import random
import gym
import numpy as np
import tensorflow as tf
from collections import deque # Double-Ended Queue which can add and remove elements from both ends. Supports both FIFO and LIFO.

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam

In [None]:
#
# Create the CartPole-v1 environment and report the sizes of its spaces.
# Each observation is the vector [Pos Velocity Angle Angular-Velocity].
# new_step_api=True makes env.step() return five values, which is what the
# training and evaluation loops unpack.
#
env = gym.make('CartPole-v1', new_step_api=True)
print(env.observation_space)        # full description of the observation space
print(env.observation_space.shape)  # shape tuple of one observation
print(f"{env.observation_space.shape[0]} State space")
print(f"{env.action_space.n} Action space")

Box([-4.8000002e+00 -3.4028235e+38 -4.1887903e-01 -3.4028235e+38], [4.8000002e+00 3.4028235e+38 4.1887903e-01 3.4028235e+38], (4,), float32)
(4,)
4 State space
2 Action space


## Create Dense Neural Network (NN)

In [None]:
#
# Note: input_dim=4 is same as input_shape=(4,)
# Input is 4 nodes (from State space - vector of 4), 2 hidden layer of 32 nodes each, output 2 nodes
#
# So, the input is a state s; predicting gives one Q-value per action, and the greedy action is the one with the highest Q-value
# env.observation_space.shape[0] is 4
# env.action_space.n is 2
#
def model_init():
  """Build and compile a fresh Q-network for the global `env`.

  Architecture: one input per observation component, two hidden ReLU layers
  of 32 units each, and one linear output per action (a Q-value regression).

  Returns:
    A compiled Keras Sequential model using MSE loss and the Adam optimizer
    (learning rate 0.001, clipnorm=1.0 so gradients are norm-clipped before
    the weights are updated).
  """
  n_inputs = env.observation_space.shape[0]   # size of the state vector
  n_actions = env.action_space.n              # one Q-value output per action

  layers = [
      Dense(32, input_dim=n_inputs, activation='relu'),
      Dense(32, activation='relu'),
      Dense(n_actions, activation='linear'),  # linear output: regression on Q-values
  ]
  network = Sequential(layers)

  optimizer = Adam(learning_rate=0.001, clipnorm=1.0)
  network.compile(optimizer=optimizer, loss='mse')
  return network

In [None]:
# Two networks with the same architecture: the online model (built first) is
# the one being trained, the target model is used when computing replay targets.
online_Model, target_Model = model_init(), model_init()

# Start the target model from exactly the same weights as the online model.
target_Model.set_weights(online_Model.get_weights())

## Hyperparameters

In [None]:
#-------------
# Parameters
#-------------

GAMMA = 0.99 # discount factor applied to the bootstrapped next-state Q-value in experience_replay()
EPSILON = 1.0 # initial exploration rate; NOTE(review): the training loop below sets its own fixed `eps` and does not read this
EPSILON_MIN = 0.01 # lower bound for epsilon; only referenced by the commented-out decay line in the training loop
EPSILON_DECAY = 0.995 # multiplicative decay factor; only referenced by the commented-out decay line in the training loop
MEMORYSIZE = 100000 # maximum number of transitions kept in the replay memory deque
BATCHSIZE = 64 # number of transitions sampled from memory for each replay update
C = 1000 # we update and sync target NN weights every C steps

# Train until an episode reaches a reward of 200, then run 100 evaluation episodes

In [None]:
# Replay memory: a bounded deque, so once MEMORYSIZE transitions are stored the
# oldest one is dropped for every new one appended (FIFO).
memory = deque(maxlen=MEMORYSIZE) #using deque (FIFO) for experience-replay memory
# Rolling window of the most recent 100 episode scores, used for the running
# average printed by the training loop.
score = deque(maxlen=100) #keep track of scores of consecutive 100 episodes as defined in the winning requirement of Cartpole

## Q-Learning update (for reference)

In [None]:
#For each episode:
#    init s (reset env)
#
#    For each step till done:
#        execute eps-greedy policy to choose an action
#
#        sample next state s' and reward r based on chosen action
#
#        Q[s,a] = Q[s,a] + lr*(r + gamma*np.max(Q[s',:]) - Q[s,a]) #update Q table
#
#        s = s' # that is, we now move to state s'

In [None]:
# Hard update: overwrite the target network with the online network's weights
def hard_update_target_model():
    """Copy every weight of the global `online_Model` into `target_Model`."""
    online_weights = online_Model.get_weights()
    target_Model.set_weights(online_weights)

## Function to implement experience replay

In [None]:
#
# Replay in batch mode to train NN
# (Train NN with 1 batch of random samples from replay buffer)
#
def experience_replay():
    """Run one Double-DQN training step on a random mini-batch from `memory`.

    Does nothing until the replay memory holds at least BATCHSIZE transitions.
    Otherwise it samples BATCHSIZE transitions (s, a, r, s_, done) and, for
    each one, builds the regression target

        target = r                                         if done
        target = r + GAMMA * Q_target(s_, argmax_a' Q_online(s_, a'))   otherwise

    i.e. the online model *selects* the best next action and the target model
    *evaluates* it. The online model is then fitted for one epoch on
    (states -> Q-values), where only the Q-value of the action actually taken
    is replaced by the target; the other outputs keep their current
    predictions so they contribute no error.

    All forward passes are done once per batch rather than once per sample.

    Reads globals: memory, BATCHSIZE, GAMMA, online_Model, target_Model.
    Returns: None.
    """
    if len(memory) < BATCHSIZE:
        return  # only start replay after memory has at least BATCHSIZE samples

    # random.sample() returns a list of BATCHSIZE distinct transitions
    batch = random.sample(memory, BATCHSIZE)
    states, actions, rewards, next_states, dones = zip(*batch)

    # Each stored state has shape (1, state_dim) (the callers wrap it for
    # model.predict()), so stacking vertically gives (BATCHSIZE, state_dim).
    states = np.vstack(states)
    next_states = np.vstack(next_states)
    actions = np.asarray(actions, dtype=np.int64)
    rewards = np.asarray(rewards, dtype=np.float32)
    dones = np.asarray(dones, dtype=bool)

    # One forward pass per network for the whole batch
    q_next_online = online_Model.predict(next_states, batch_size=BATCHSIZE, verbose=0)
    q_next_target = target_Model.predict(next_states, batch_size=BATCHSIZE, verbose=0)
    q_targets = online_Model.predict(states, batch_size=BATCHSIZE, verbose=0).copy()

    rows = np.arange(BATCHSIZE)

    # Double DQN: online model selects the action, target model evaluates it
    best_actions = np.argmax(q_next_online, axis=1)
    bootstrapped = rewards + GAMMA * q_next_target[rows, best_actions]

    # Terminal transitions have no future reward to bootstrap from
    targets = np.where(dones, rewards, bootstrapped)

    # Overwrite only Q[s, a] for the action that was actually taken
    q_targets[rows, actions] = targets

    # Train NN with 1 batch of BATCHSIZE samples for 1 gradient-descent step
    online_Model.fit(states, q_targets, epochs=1, batch_size=BATCHSIZE, verbose=0)

## Learning

In [None]:
#
# SHORT CUT: OVERRIDE AND LOAD MODEL WITH PRE-TRAINED PARAMETERS
#            HELP SPEED UP TRAINING
#
# The checkpoint is written by the training loop at the end of every episode,
# so on a first run it does not exist yet; in that case keep the freshly
# initialised networks instead of failing.
#
import os
from tensorflow.keras.models import load_model

MODEL_PATH = '/content/drive/MyDrive/CartPole.keras'

if os.path.exists(MODEL_PATH):
    # Load trained model
    online_Model = load_model(MODEL_PATH)
    # Keep the target network consistent with the loaded weights; otherwise the
    # first replay updates would be evaluated by an unrelated, randomly
    # initialised target network.
    target_Model.set_weights(online_Model.get_weights())
else:
    print(f"No saved model found at {MODEL_PATH}; training starts from the freshly initialised networks.")

# Load last saved epsilon value
#EPSILON = np.load('/content/drive/MyDrive/epsilon.npy')

In [None]:
# ====
# MAIN
# ====
#
# Train the online model with eps-greedy exploration and per-step experience
# replay until a single episode collects a reward of at least 200.
#

eps = 0.01      # fixed exploration rate (the eps-decay line below is disabled)
i = 0           # Episode i
step_count = 0  # total environment steps across all episodes, drives the C-step target sync

while True: # we continue each episode i till the winning condition is achieved
    i = i + 1
    state = np.array([env.reset()]) # [[Pos Velocity Angle Angular-Velocity]], we use numpy array
                                    # we make it an array of arrays because model.predict() requires it
    reward_per_episode = 0

    ##
    ## For each step inside an Episode
    ##
    for t in range(500): # each episode only has max 500 time steps in CartPole
        step_count += 1 # Increment Step count

        #
        # eps-greedy policy
        #
        if np.random.rand() <= eps:
            action = env.action_space.sample()
        else:
            # forward pass to get the Q-values of the current state, then pick
            # the action with the highest one
            action = np.argmax(online_Model.predict(state, verbose=0))

        # step to next state; the new step API reports the two ways an episode
        # can end separately: terminated (failure state) and truncated (time limit)
        state_, reward, terminated, truncated, _ = env.step(action) #  [...]
        next_state = np.array([state_])               # [[...]], because of model.predict() at experience_replay()
        reward_per_episode += reward

        # Add to deque memory for experience replay - To train NN later.
        # Only `terminated` is stored as the done flag: a time-limit cut-off is
        # not a terminal state, so its target should still bootstrap from s'.
        memory.append((state, action, reward, next_state, terminated))

        #
        # Experience Replay at each time step to train the NN
        # Use 1 batch of experiences from memory to train
        # NN so that it will learn to map from a state to QValues_sa (output vector of 2)
        #
        experience_replay()

        # **Sync target network every C steps**
        if step_count % C == 0:
            hard_update_target_model()

        if terminated or truncated:
            break

        state = next_state #S = S'

    #
    # End-of-episode bookkeeping. This runs however the episode ended (pole
    # fell, time limit, or step loop exhausted), so full-length episodes are
    # logged and checkpointed too.
    #
    # NOTE(review): this additional sync at every episode end makes the target
    # network follow the online network much more often than every C steps
    # while episodes are short - confirm this is intended.
    hard_update_target_model()  # Sync Target Network
    score.append(reward_per_episode)
    print(f"Episode: {i}, Score: {reward_per_episode}, Avg over last 100 tries: {sum(score)/len(score):.1f}, eps={eps:0.4f}")
    online_Model.save('/content/drive/MyDrive/CartPole.keras')  # Save model at end of each episode
    np.save('/content/drive/MyDrive/epsilon.npy', eps)  # Save epsilon value

    # eps-decay
    #if EPSILON > EPSILON_MIN: EPSILON *= EPSILON_DECAY

    # STOP as soon as an episode reaches a reward of at least 200
    if reward_per_episode >= 200:
        print(f"Training Complete! Episode {i} reached 200 reward.")
        break


Episode: 1, Score: 76.0, Avg over last 100 tries: 76.0, eps=0.0100
Episode: 2, Score: 12.0, Avg over last 100 tries: 44.0, eps=0.0100
Episode: 3, Score: 10.0, Avg over last 100 tries: 32.7, eps=0.0100
Episode: 4, Score: 17.0, Avg over last 100 tries: 28.8, eps=0.0100
Episode: 5, Score: 9.0, Avg over last 100 tries: 24.8, eps=0.0100
Episode: 6, Score: 9.0, Avg over last 100 tries: 22.2, eps=0.0100
Episode: 7, Score: 9.0, Avg over last 100 tries: 20.3, eps=0.0100
Episode: 8, Score: 9.0, Avg over last 100 tries: 18.9, eps=0.0100
Episode: 9, Score: 9.0, Avg over last 100 tries: 17.8, eps=0.0100
Episode: 10, Score: 10.0, Avg over last 100 tries: 17.0, eps=0.0100
Episode: 11, Score: 9.0, Avg over last 100 tries: 16.3, eps=0.0100
Episode: 12, Score: 10.0, Avg over last 100 tries: 15.8, eps=0.0100
Episode: 13, Score: 10.0, Avg over last 100 tries: 15.3, eps=0.0100
Episode: 14, Score: 13.0, Avg over last 100 tries: 15.1, eps=0.0100
Episode: 15, Score: 92.0, Avg over last 100 tries: 20.3, eps=0.

**Then, with the trained model, run 100 episodes (i.e., no more
training is needed here) to find the average reward per episode over these
100 episodes. Print the rewards obtained every 10 episodes.**

In [None]:
# Evaluate trained model for 100 episodes (greedy policy, no training, no exploration)
rewards = []  # total reward of each evaluation episode, in order
for ep in range(1, 101):
    state = np.array([env.reset()])  # wrapped as [[...]] because model.predict() expects a batch
    total_reward = 0
    done = False

    while not done:
        # always take the action with the highest predicted Q-value
        action = np.argmax(online_Model.predict(state, verbose=0))
        state_, reward, terminated, truncated, _ = env.step(action)
        # An episode is over when the pole falls (terminated) OR the time
        # limit is hit (truncated); checking only the first would let an
        # episode keep running past the environment's step cap.
        done = terminated or truncated
        state = np.array([state_])
        total_reward += reward

    rewards.append(total_reward)

    # Print average reward every 10 episodes
    if ep % 10 == 0:
        avg = np.mean(rewards[-10:])  # the 10 most recent episodes
        print(f"Episode {ep-9} to {ep}: Avg Reward = {avg:.2f}")

# Overall result requested for the evaluation: mean reward per episode over the whole run
print(f"Average reward over all {len(rewards)} episodes = {np.mean(rewards):.2f}")


Episode 1 to 10: Avg Reward = 178.20
Episode 11 to 20: Avg Reward = 175.50
Episode 21 to 30: Avg Reward = 126.80
Episode 31 to 40: Avg Reward = 221.80
Episode 41 to 50: Avg Reward = 133.00
Episode 51 to 60: Avg Reward = 162.80
Episode 61 to 70: Avg Reward = 169.80
Episode 71 to 80: Avg Reward = 174.90
Episode 81 to 90: Avg Reward = 175.50
Episode 91 to 100: Avg Reward = 169.20
