In [80]:
import gym
import tensorflow as tf
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras import Model
from tensorflow.keras.optimizers import Adam
import numpy as np
import random
from collections import deque
import os
import matplotlib.pyplot as plt

1) More actions, less actions: What are appropriate number of actions to discretise the range -2.0 to 2.0?
2) Stability of training, i.e. should you train longer or cut it off within some number of episodes?
3) Track the reward, save weights, plot performance. Reproduce your best possible agent by loading your best weights and test it for say, 10 times. Does it consistently balance the pendulum for all 10 times when tested?
4) Exploration vs exploitation (the epsilon hyperparameter). Should you decay it?
5) Explain the differences between this code and the lab code for cartpole.

# NO NEED GPU! CPU will do!

In [81]:
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

In [82]:
class DQN:
    def __init__(self,
                 InputShape = 4,
                 NActions = 2,
                 Gamma = 0.95,  # Discount rate
                 epsilon = 1.0,  # Exploration rate
                 epsilon_min = 0.1,
                 epsilon_decay = 0.995,
                 learning_rate = 0.01,
                 ReplayMemorySize = 10000,
                 MinReplayMemory = 1000,
                 UpdateTargetEveryThisEpisodes = 1,
                 IntermediateSize = 64,
                 BatchSize = 32):
        
        # Hyperparameters. #
        
        self.InputShape = InputShape
        self.NActions = NActions
        self.Gamma = Gamma
        self.ReplayMemorySize = ReplayMemorySize
        self.MinReplayMemory = MinReplayMemory
        self.UpdateTargetEveryThisEpisodes = UpdateTargetEveryThisEpisodes
        self.IntermediateSize = IntermediateSize
        self.BatchSize = BatchSize
        self.epsilon = epsilon
        self.epsilon_min = epsilon_min
        self.epsilon_decay = epsilon_decay
        self.learning_rate = learning_rate
        
        self.total_reward = 0  # Initialize total reward accumulator
        self.episode_rewards = []  # List to store rewards for each episode
        self.average_rewards = []
    


        # Main model. #
        
        self.Main = self.CreateModel('Main')
        self.Optimiser = Adam()
        
        # Target model. #
        
        self.Target = self.CreateModel('Target')
        self.Target.set_weights(self.Main.get_weights())
        
        # Replay memory. #
        
        self.ReplayMemory = deque(maxlen = ReplayMemorySize)
        
        # Target network update counter. #
        
        self.TargetUpdateCounter = 0

    def save_weights(self, dir_path):
        print(f'Saving weights to {dir_path}')
        if not os.path.exists(dir_path):
            os.makedirs(dir_path)
        Main_path = os.path.join(dir_path, 'Main')
        Target_path = os.path.join(dir_path, 'Target')
        self.Main.save_weights(Main_path + '_main.weights.h5')
        self.Target.save_weights(Target_path + '_target.weights.h5')
    
    
    def load_weights(self, path):
        print(f'Loading weights from {path}')
        Main_path = os.path.join(path, 'Main')
        Target_path = os.path.join(path, 'Target')
        self.Main.load_weights(Main_path + '_main.weights.h5')
        self.Target.load_weights(Target_path + '_target.weights.h5')

    def moving_average (self, values, window):
        weights = np.repeat(1.0, window)/window
        return np.convolve(values, weights, 'valid')
    
    def plot_rewards(self):
        plt.figure(figsize=(20, 5))
        plt.plot(self.episode_rewards, marker='o')
        plt.plot(self.moving_average(self.episode_rewards, 10), marker='o')
        plt.title('Episode Rewards')
        plt.xlabel('Episode')
        plt.ylabel('Total Reward')
        plt.grid(True)
        plt.show()

    def plot_avg_rewards(self):
        plt.figure(figsize=(20, 5))
        plt.plot(self.average_rewards, marker='o')
        plt.plot(self.moving_average(self.average_rewards, 10), marker='o')
        plt.title('Episode Rewards')
        plt.xlabel('Episode')
        plt.ylabel('Average Reward')
        plt.grid(True)
        plt.show()


    def CreateModel(self, Type):
        inputs = Input(shape = (self.InputShape,), name = 'Input')
        x = Dense(self.IntermediateSize, activation = 'relu', name = '1stHiddenLayer')(inputs)
        x = Dense(self.IntermediateSize, activation = 'relu', name = '2ndHiddenLayer')(x)
        outputs = Dense(self.NActions, activation = 'linear', name = 'Output')(x)
        
        NN = Model(inputs, outputs, name = f'{Type}')
        NN.summary()
        
        return NN
    
    def UpdateReplayMemory(self, Information): # Information = (state, action, reward, SNext, Done)
        self.ReplayMemory.append(Information)

        # Epsilon-Greedy Policy to choose action
    def act(self, state):
        if np.random.rand() <= self.epsilon:
            # pull random action (exploration)
            return random.randrange(self.action_size)
        # else pull current-best action (greedy; exploitation)
        act_values = self.model.predict(state, verbose = 0)
        return np.argmax(act_values[0])


    def Train(self, EndOfEpisode, reward):
        self.total_reward += reward  # Accumulate reward for the current episode

        # Only train if replay memory has enough data. #
        
        if len(self.ReplayMemory) < self.MinReplayMemory:
            print(f'DID NOT TRAIN..., replay memory = {len(self.ReplayMemory)}')
            return
        
        # Get batch of data for training. #
        
        TrainingData = random.sample(self.ReplayMemory, self.BatchSize)
        
        # Get states from training data, then get corresponding Q values. #
        
        ListOfS = np.array([element[0] for element in TrainingData])
        ListOfQ = np.array(self.Main(ListOfS))
        
        # Get future states from training data, then get corresponding Q values. #
        
        ListOfSNext = np.array([element[3] for element in TrainingData])
        ListOfQNext = self.Target(ListOfSNext)
        
        # Build actual training data for neural network. #
        
        X = []
        Y = []
        for index, (state, action, reward, SNext, Done) in enumerate(TrainingData):
            if not Done:
                MaxQNext = np.max(ListOfQNext[index])
                QNext = reward + self.Gamma * MaxQNext
            else:
                QNext = reward
            Q = ListOfQ[index]
            Q[action] = QNext
        
            X.append(state)
            Y.append(Q)
        
        # Train model using tf.GradientTape(), defined below.
    
        self.GTfit(X, Y)
                
        # Update target network every episode. #
        
        if EndOfEpisode:
            self.episode_rewards.append(self.total_reward)  # Store total reward for the episode
            self.total_reward = 0  # Reset total reward for the next episode
            self.TargetUpdateCounter += 1

        
        # Update target if counter is full. #
        
        if self.TargetUpdateCounter >= self.UpdateTargetEveryThisEpisodes:
            self.Target.set_weights(self.Main.get_weights())
            self.TargetUpdateCounter = 0

        # Decay epsilon gradually
        # if self.epsilon > self.epsilon_min:
        #     self.epsilon *= self.epsilon_decay
        #     print('epsilon:', self.epsilon)


    # This is the tf.GradientTape() which significantly speeds up training of neural networks
    @tf.function
    def GTfit(self, X, Y):
        
        # Train the neural network with this batch of data. #
        
        with tf.GradientTape() as tape:
            Predictions = self.Main(tf.convert_to_tensor(X), training = True)
            Loss = tf.math.reduce_mean(tf.math.square(tf.convert_to_tensor(Y) - Predictions))
        Grad = tape.gradient(Loss, self.Main.trainable_variables)
        self.Optimiser.apply_gradients(zip(Grad, self.Main.trainable_variables))

In [83]:
EnvName = 'Pendulum-v0'
IntermediateSize = 64
Epsilon_min = 0.1
Epsilon_decay = 0.99
Epsilon = 1.0
ShowEvery = 10
InputShape = 3
NActions = 10


In [84]:
def PendulumActionConverter(action, NActions=NActions):
    ActualTorque = (action / NActions - 0.5) * 4
    return ActualTorque

def PendulumInverseActionConverter(action, NActions=NActions):
    ActualA = round((action + 2) * (NActions - 1) / 4)
    return(ActualA)

def OneEpisode(DQN):
    env = gym.make(f'{EnvName}')
    state = env.reset()
    ListOfRewards = []
    Done = False
    global Epsilon
    while not Done:
        Q = DQN.Main(state.reshape(-1, state.shape[0]))
        if np.random.rand() < Epsilon:
            AStep = env.action_space.sample()
            action = PendulumInverseActionConverter(AStep[0])
        else:
            action = np.argmax(Q)
            action = PendulumActionConverter(action)
            AStep = np.array([action])
            action = PendulumInverseActionConverter(action)
        #Epsilon *= Epsilon_decay 
        if not _ % ShowEvery and len(DQN.ReplayMemory) >= DQN.MinReplayMemory:
            env.render()
        SNext, reward, Done, Info = env.step(AStep)
        DQN.UpdateReplayMemory((state, action, reward, SNext, Done))
        DQN.Train(Done, reward)
        ListOfRewards.append(reward)

        if Done:
            print(f'Finished! | Return: {np.sum(ListOfRewards)} | average reward: {np.mean(ListOfRewards)}')
            env.close()
            return ListOfRewards
        state = SNext

In [85]:
import time
STARTTIME = time.time()

dqn = DQN(InputShape = InputShape, NActions = NActions)
dir_path='best_dqn_weights'
dqn.load_weights(path=dir_path)

Epsilon = 1.0
EPISODES = 450
best_reward = -1000
avg_reward = []
for _ in range(EPISODES):

    print(f'Episode {_}')
    reward = OneEpisode(dqn)
    if Epsilon > Epsilon_min:
        Epsilon *= Epsilon_decay 
        print('Epsilon:',Epsilon)

    dqn.episode_rewards.append(np.sum(reward))
    dqn.average_rewards.append(np.mean(reward))
    print(f'Best reward: {best_reward}')
    if np.mean(reward) > best_reward:
        best_reward = np.mean(reward)
        dir_path = 'best_dqn_weights'
        print(f'Saving best model weights for episode {_} with reward {np.mean(reward)}')
        dqn.save_weights(dir_path=dir_path)



 
        


print(f'Total time taken: {time.time() - STARTTIME} seconds ...')


# Plot rewards
dqn.plot_rewards()

dqn.plot_avg_rewards()



Loading weights from best_dqn_weights
Episode 0
DID NOT TRAIN..., replay memory = 1
DID NOT TRAIN..., replay memory = 2
DID NOT TRAIN..., replay memory = 3
DID NOT TRAIN..., replay memory = 4
DID NOT TRAIN..., replay memory = 5
DID NOT TRAIN..., replay memory = 6
DID NOT TRAIN..., replay memory = 7
DID NOT TRAIN..., replay memory = 8
DID NOT TRAIN..., replay memory = 9
DID NOT TRAIN..., replay memory = 10
DID NOT TRAIN..., replay memory = 11
DID NOT TRAIN..., replay memory = 12
DID NOT TRAIN..., replay memory = 13
DID NOT TRAIN..., replay memory = 14
DID NOT TRAIN..., replay memory = 15
DID NOT TRAIN..., replay memory = 16
DID NOT TRAIN..., replay memory = 17
DID NOT TRAIN..., replay memory = 18
DID NOT TRAIN..., replay memory = 19
DID NOT TRAIN..., replay memory = 20
DID NOT TRAIN..., replay memory = 21
DID NOT TRAIN..., replay memory = 22
DID NOT TRAIN..., replay memory = 23
DID NOT TRAIN..., replay memory = 24
DID NOT TRAIN..., replay memory = 25
DID NOT TRAIN..., replay memory = 26

KeyboardInterrupt: 

In [None]:
dqn = DQN(InputShape = InputShape, NActions = NActions)

# Test the best agent by loading the best weights
dir_path = 'best_dqn_weights'

dqn.load_weights(path=dir_path)
Epsilon = 1.0
test_episodes = 10
test_rewards = []
for _ in range(test_episodes):
    print(f'Episode {_}')
    reward = OneEpisode(dqn)
    test_rewards.append(reward)

print(f'Test Rewards: {test_rewards}')
print(f'Average Test Reward: {np.mean(test_rewards)}')

env = gym.make(f'{EnvName}')
state = env.reset()
ListOfRewards = []
Done = False
while not Done:
    Q = dqn.Main(state.reshape(-1, state.shape[0]))
    action = np.argmax(Q)
    action = PendulumActionConverter(action)
    AStep = np.array([action])
    action = PendulumInverseActionConverter(action)
    env.render()
    SNext, reward, Done, Info = env.step(AStep)
    # DQN.UpdateReplayMemory((state, action, reward, SNext, Done))
    # DQN.Train(Done, reward)
    # ListOfRewards.append(reward)
    # all_rewards.append(reward)
    # all_sum_rewards.append(np.sum(ListOfRewards))
    state = SNext

Loading weights from best_dqn_weights
Episode 0
DID NOT TRAIN..., replay memory = 1
DID NOT TRAIN..., replay memory = 2
DID NOT TRAIN..., replay memory = 3
DID NOT TRAIN..., replay memory = 4
DID NOT TRAIN..., replay memory = 5
DID NOT TRAIN..., replay memory = 6
DID NOT TRAIN..., replay memory = 7
DID NOT TRAIN..., replay memory = 8
DID NOT TRAIN..., replay memory = 9
DID NOT TRAIN..., replay memory = 10
DID NOT TRAIN..., replay memory = 11
DID NOT TRAIN..., replay memory = 12
DID NOT TRAIN..., replay memory = 13
DID NOT TRAIN..., replay memory = 14
DID NOT TRAIN..., replay memory = 15
DID NOT TRAIN..., replay memory = 16
DID NOT TRAIN..., replay memory = 17
DID NOT TRAIN..., replay memory = 18
DID NOT TRAIN..., replay memory = 19
DID NOT TRAIN..., replay memory = 20
DID NOT TRAIN..., replay memory = 21
DID NOT TRAIN..., replay memory = 22
DID NOT TRAIN..., replay memory = 23
DID NOT TRAIN..., replay memory = 24
DID NOT TRAIN..., replay memory = 25
DID NOT TRAIN..., replay memory = 26

In [None]:
hyperparameters = {
    'EnvName': EnvName,
    'IntermediateSize': IntermediateSize,
    'Epsilon': Epsilon,
    'ShowEvery': ShowEvery,
    'InputShape': InputShape,
    'NActions': NActions
}

dqn = DQN(InputShape = InputShape, NActions = NActions)

# Test the best agent by loading the best weights
dir_path = 'best_dqn_weights'

dqn.load_weights(path=dir_path)
test_episodes = 10
Epsilon = 1.0
test_rewards = []
for _ in range(test_episodes):
    print(f'Episode {_}')
    reward = OneEpisode(dqn)
    test_rewards.append(reward)

print(f'Test Rewards: {test_rewards}')
print(f'Average Test Reward: {np.mean(test_rewards)}')

env = gym.make(f'{EnvName}')
state = env.reset()
ListOfRewards = []
Done = False
while not Done:
    Q = dqn.Main(state.reshape(-1, state.shape[0]))
    action = np.argmax(Q)
    action = PendulumActionConverter(action)
    AStep = np.array([action])
    action = PendulumInverseActionConverter(action)
    env.render()
    SNext, reward, Done, Info = env.step(AStep)

    state = SNext

print(f'Hyperparameters: {hyperparameters}')
print(f'Test Rewards: {test_rewards}')
print(f'Average Test Reward: {np.mean(test_rewards)}')

env.close()

Loading weights from best_dqn_weights
Episode 0
DID NOT TRAIN..., replay memory = 1
DID NOT TRAIN..., replay memory = 2
DID NOT TRAIN..., replay memory = 3
DID NOT TRAIN..., replay memory = 4
DID NOT TRAIN..., replay memory = 5
DID NOT TRAIN..., replay memory = 6
DID NOT TRAIN..., replay memory = 7
DID NOT TRAIN..., replay memory = 8
DID NOT TRAIN..., replay memory = 9
DID NOT TRAIN..., replay memory = 10
DID NOT TRAIN..., replay memory = 11
DID NOT TRAIN..., replay memory = 12
DID NOT TRAIN..., replay memory = 13
DID NOT TRAIN..., replay memory = 14
DID NOT TRAIN..., replay memory = 15
DID NOT TRAIN..., replay memory = 16
DID NOT TRAIN..., replay memory = 17
DID NOT TRAIN..., replay memory = 18
DID NOT TRAIN..., replay memory = 19
DID NOT TRAIN..., replay memory = 20
DID NOT TRAIN..., replay memory = 21
DID NOT TRAIN..., replay memory = 22
DID NOT TRAIN..., replay memory = 23
DID NOT TRAIN..., replay memory = 24
DID NOT TRAIN..., replay memory = 25
DID NOT TRAIN..., replay memory = 26

Exception ignored in: <function Viewer.__del__ at 0x000002688BBD8280>
Traceback (most recent call last):
  File "C:\Users\p2300575\AppData\Roaming\Python\Python39\site-packages\gym\envs\classic_control\rendering.py", line 165, in __del__
    self.close()
  File "C:\Users\p2300575\AppData\Roaming\Python\Python39\site-packages\gym\envs\classic_control\rendering.py", line 83, in close
    self.window.close()
  File "C:\Users\p2300575\AppData\Roaming\Python\Python39\site-packages\pyglet\window\win32\__init__.py", line 299, in close
    super(Win32Window, self).close()
  File "C:\Users\p2300575\AppData\Roaming\Python\Python39\site-packages\pyglet\window\__init__.py", line 823, in close
    app.windows.remove(self)
  File "c:\ProgramData\Anaconda3\lib\_weakrefset.py", line 114, in remove
    self.data.remove(ref(item))
KeyError: <weakref at 0x000002688E546B80; to 'Win32Window' at 0x000002688C291AC0>


Finished! | Return: -251.6355947287577 | average reward: -1.2581779736437886
Episode 5
Finished! | Return: -242.8397495817667 | average reward: -1.2141987479088336
Episode 6
Finished! | Return: -688.6334400539816 | average reward: -3.443167200269908
Episode 7
Finished! | Return: -379.90035175854524 | average reward: -1.8995017587927263
Episode 8
Finished! | Return: -509.25548458548974 | average reward: -2.5462774229274485
Episode 9
Finished! | Return: -411.50021250303365 | average reward: -2.0575010625151684
Test Rewards: [[-1.360768241254011, -1.7415987850831296, -2.3566990197385067, -3.2500898631799107, -4.470329119151151, -6.182376926245459, -8.17005018074026, -10.467504650263818, -12.953810076494284, -14.296959329260256, -12.254352702534051, -10.171335742191852, -8.16249689120849, -6.343837534957548, -4.794320457872284, -3.5417059399638964, -2.573669091991831, -1.94376755214045, -1.4692055187619308, -1.1187631679480805, -0.8657781289534399, -0.6913462301772781, -0.3910182214566206,