In [1]:
# Ideas taken from:
# https://github.com/mswang12/minDQN/blob/main/minDQN.py
# https://towardsdatascience.com/infinite-steps-cartpole-problem-with-variable-reward-7ad9a0dcf6d0
# https://deeplizard.com/learn/video/ewRw996uevN

# Deviations from the algorithms described:
# 1) I didnt sample from the replay memory but instead just used the whole memory and flushed it afterwards
# 2) I scored the current state using the formula given in the towards data science article
#    gave reward based on the difference between the score before and after the action was taken
#    reason is that we want to reward the action improving the state that we are in

import gym
import tensorflow as tf
import numpy as np
from tensorflow import keras
import random

physical_devices = tf.config.list_physical_devices('GPU')
print(tf.__version__)
print("Num GPUs:", len(physical_devices))

train_episodes = 100
test_episodes = 3

#definition of epsilon greedy: random action with probability epsilon
#Epsilon-greedy algorithm in initialized at 1 meaning every step is random at the start
#Epsilon of 1 means we are exploring all the time

epsilon = 1 
max_epsilon = 1 
min_epsilon = 0.01 
decay = 0.01

#discount rate for future rewards
gamma = 0.9

#update the target network every 10 episodes
target_update_episodes = 10

learning_rate = 0.001

batch_size = 128

2.8.0
Num GPUs: 1


In [4]:
env = gym.make('CartPole-v1')

#define model
#model has input of shape of observations
#model outputs the q values (expected rewards for action taken in that state)
#for cart-pole, 
#input vector is a vector of 4 [cart pos, cart vel, pole angle, pole tip vel] 
#output is a vector of 2 [move left, move right]

model = keras.Sequential()
model.add(keras.layers.Dense(24, input_shape=(4,), activation='relu'))
model.add(keras.layers.Dense(12, activation='relu'))
model.add(keras.layers.Dense(2, activation='linear'))

model.compile(loss=tf.keras.losses.MeanSquaredError(), optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate), metrics=['mse'])

#define target model
target_model = keras.Sequential()
target_model.add(keras.layers.Dense(24, input_shape=(4,), activation='relu'))
target_model.add(keras.layers.Dense(12, activation='relu'))
target_model.add(keras.layers.Dense(2, activation='linear'))

target_model.compile(loss=tf.keras.losses.MeanSquaredError(), optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate), metrics=['mse'])

target_model.set_weights(model.get_weights())

def get_next_action(state):
    if np.random.random() >= epsilon:
        return np.argmax(model.predict(state.reshape(1,-1)))
    else:
        return np.random.randint(2)
    
def perform_evaluation():
    scores = []

    for i in range(test_episodes):

        state = env.reset()
        done = False
        episode_score = 0

        while not done:

            env.render()
            action = np.argmax(model.predict(state.reshape(1,-1)))
            next_state,reward,done,_ = env.step(action)
            state = next_state
            episode_score += 1

        scores.append(episode_score)
        
    return sum(scores)/len(scores)

In [3]:
replay_memory = []
best_result = 0

for episode in range(train_episodes):
    
    state = env.reset()
    #we are giving a higher score for position and angle being closer to zero
    #normalising by twice the max**2 (2.4 for pos and 0.2095 for angle)
    score_old = 1 - (state[0]**2) / 11.52 - (state[2]**2) / 0.0877805
    done = False
    
    while not done:
        
        env.render()
        action = get_next_action(state)
        
        #env.step returns obs,reward,done,info
        next_state,_,done,_ = env.step(action)
        
        score = 1 - (state[0]**2) / 11.52 - (state[2]**2) / 0.0877805
        reward = score - score_old
        
        replay_memory.append(np.concatenate((state, [action], [reward], next_state)))
        
        score_old = score
        state = next_state
        
        if len(replay_memory) == batch_size:
            
            replay_memory = np.array(replay_memory)
            
            current_states = replay_memory[:,:4]
            actions = replay_memory[:,4]
            rewards = replay_memory[:,5]
            next_states = replay_memory[:,6:]
            
            replay_memory = []
            
            current_q_values = model.predict(current_states)
            next_q_values = target_model.predict(next_states)
            
            for i in range(batch_size):
                current_q_values[i,int(actions[i])] = rewards[i] + (gamma*np.max(next_q_values[i,:])) 
            
            model.fit(current_states, current_q_values, verbose = 0)
            #target_model.set_weights(model.get_weights())
            
    epsilon = min_epsilon + (max_epsilon - min_epsilon) * np.exp(-decay * episode)
    
    if (episode+1) % target_update_episodes == 0:
        
        target_model.set_weights(model.get_weights())
        result = perform_evaluation()
        
        print("Episode: " + str(episode+1)+ " Epsilon: " + str(epsilon) + " Eval: " + str(result))
        
        if result > 200 and result > best_result:
            best_result = result
            print("Saving Model...")
            model.save('cartpole_DQN_model/model')
            
env.close()

Episode: 10 Epsilon: 0.9147918734185159 Eval: 94.0
Episode: 20 Epsilon: 0.8286895426039287 Eval: 25.666666666666668
Episode: 30 Epsilon: 0.7507809319027796 Eval: 25.666666666666668
Episode: 40 Epsilon: 0.680286305753183 Eval: 120.0
Episode: 50 Epsilon: 0.616500130242572 Eval: 72.33333333333333
Episode: 60 Epsilon: 0.558784011887162 Eval: 51.0
Episode: 70 Epsilon: 0.5065603083753949 Eval: 157.66666666666666
Episode: 80 Epsilon: 0.4593063473295323 Eval: 153.0
Episode: 90 Epsilon: 0.41654919522482203 Eval: 91.0
Episode: 100 Epsilon: 0.37786092411182526 Eval: 315.3333333333333
Saving Model...
INFO:tensorflow:Assets written to: cartpole_DQN_model/model\assets


In [5]:
#Checking what the model is doing

model = tf.keras.models.load_model('cartpole_DQN_model/model')
test_episodes = 10
result = perform_evaluation()
print(result)
env.close()

362.6
