## Feel free to change `self.render` and `self.load_model` to control whether the game window is displayed and whether the optimal weights I converged on are loaded at start-up, thank you.

In [None]:
# Provided Imports.
import gym
from gym.envs.classic_control.cartpole import *
from pyglet.window import key
import matplotlib.pyplot as plt
import time

# MY IMPORTS - ALL SHOULD BE EASILY AVAILABLE WITHIN ANACONDA.
import statistics
import sys
import random
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from collections import deque
from tensorflow import keras
from keras.layers import Dense
from keras.optimizers import Adam
from keras.models import Sequential

# Module-level state shared by key_press, run_cartPole_asAgent and the plotting code below.
number_of_trials = 100 # Episodes to play; 100 so that the converged model is shown for episodes 50-100 in the worst case.

bool_do_not_quit = True  # Cleared by key_press (Esc / Q); the episode loop exits the program when it is False.

# Lists.
scores = []  # Final score of each finished episode, in order.
episodes = [] # Index of each finished episode, so the total played is known.


# Function that maps keys to program behaviour.
def key_press(k, mod):
    """Translate a pyglet key press into the module-level control flags.

    k   -- key symbol reported by the window.
    mod -- modifier bitmask reported by the window (not used).

    Enter sets `restart`; Escape or Q clears `bool_do_not_quit`.
    """
    global bool_do_not_quit, restart

    # 0xff0d corresponds to the Enter key.
    if k == 0xff0d:
        restart = True

    # Either quit key asks the main loop to stop.
    if k in (key.ESCAPE, key.Q):
        bool_do_not_quit = False
        

# The class that defines the Deep Q Network Agent.
class DeepQNetworkAgent:
    """Deep Q-Network agent: online model, target model and replay memory.

    The online model chooses actions and is trained on mini batches drawn
    from the replay memory; the target model supplies the bootstrap values
    and is refreshed from the online model via update_target_model().
    """

    # Constructor (self) initialising all the required parameters.
    def __init__(self, state_size, action_size):
        """Build both networks and optionally load previously saved weights.

        state_size  -- number of observation values fed to the network.
        action_size -- number of discrete actions the network scores.
        """
        import os  # Local import: only needed to probe for the saved model file.

        # Set render to True to see the game window; set load_model to False
        # to start from freshly initialised weights instead of the saved ones.
        self.render = False
        self.load_model = True

        # Establish the number of inputs and actions (4 inputs (observation values) | 2 actions - Left/Right).
        self.state_size = state_size    # 4.
        self.action_size = action_size  # 2.

        # These are hyper parameters for the DQN.
        self.discount_factor = 0.99 # Weight given to future reward.
        self.learning_rate = 0.001  # Step size per iteration.
        self.epsilon = 1.0          # Epsilon greedy parameters below.
        self.epsilon_decay = 0.999  # Value that epsilon is multiplied by after every stored sample.
        self.epsilon_min = 0.01     # Epsilon stops decaying once it is no longer above this value.
        self.batch_size = 64        # Upper bound on the number of samples per training step.
        self.train_start = 1000     # Training is skipped until memory holds this many samples.

        self.memory = deque(maxlen=2000) # Replay memory; the oldest samples drop out when full.

        # Create main model and target model.
        self.model = self.build_model()
        self.target_model = self.build_model()

        # Load the saved parameters, but only if they really have been saved;
        # otherwise keep the freshly built model instead of crashing.
        weights_path = "./best_dnq.h5"
        if self.load_model:
            if os.path.isfile(weights_path):
                self.model = keras.models.load_model(weights_path)
            else:
                print("No saved model found at", weights_path, "- starting from a fresh model.")

        # Initialize target model AFTER any load, so it mirrors the weights
        # the online model will actually start with.
        self.update_target_model()


    # Standard sequential neural network.
    def build_model(self):
        """Return a compiled state_size -> 24 -> 24 -> action_size network."""
        model = Sequential() # Model type
        model.add(Dense(24, input_dim=self.state_size, activation='relu', kernel_initializer='he_uniform')) # Input layer
        model.add(Dense(24, activation='relu', kernel_initializer='he_uniform'))                            # Hidden layer
        model.add(Dense(self.action_size, activation='linear', kernel_initializer='he_uniform'))            # Output layer

        # NOTE(review): newer Keras releases spell this argument `learning_rate`;
        # kept as `lr` to match the Keras version this file was written for - confirm.
        model.compile(loss='mse', optimizer=Adam(lr=self.learning_rate)) # Compile the model
        return model


    # Update the target models weights to be the same as the models current weights.
    def update_target_model(self):
        """Copy the online model's weights into the target model."""
        self.target_model.set_weights(self.model.get_weights())


    # Decide which action to choose, by using the epsilon-greedy policy.
    def decide(self, state):
        """Return a random action with probability epsilon, else the argmax-Q action."""
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        q_value = self.model.predict(state)
        return np.argmax(q_value[0])


    # Save the current samples parameters to the models memory in deque.
    def add_sample(self, state, action, reward, next_state, done):
        """Append one transition to the replay memory and decay epsilon."""
        self.memory.append((state, action, reward, next_state, done))
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay


    # Randomly select a sample from the models memory, using a mini batch sample.
    def train_model(self):
        """Fit the online model for one epoch on a random mini batch.

        Does nothing until the memory holds at least train_start samples.
        """
        # Not enough experience collected yet : return.
        if len(self.memory) < self.train_start:
            return

        # Take a sample of at most batch_size (64) transitions.
        batch_size = min(self.batch_size, len(self.memory))
        mini_batch = random.sample(self.memory, batch_size)

        # Zero-filled arrays for the states and the next states of the batch.
        update_input = np.zeros((batch_size, self.state_size))
        update_target = np.zeros((batch_size, self.state_size))

        # list creation of our parameters.
        action, reward, done = [], [], []

        # Unpack every sampled transition. Iterating the batch itself (rather
        # than range(self.batch_size)) stays in bounds when the batch was
        # clamped to the memory size.
        for i, (state, act, rew, next_state, finished) in enumerate(mini_batch):
            update_input[i] = state
            action.append(act)
            reward.append(rew)
            update_target[i] = next_state
            done.append(finished)

        # Current Q estimates (to be partially overwritten) and target-model
        # estimates for the next states.
        target = self.model.predict(update_input)
        target_val = self.target_model.predict(update_target)

        # Replace the Q value of the action taken with the training target.
        for i in range(batch_size):
            if done[i]:
                target[i][action[i]] = reward[i]
            else:
                target[i][action[i]] = reward[i] + self.discount_factor * (np.amax(target_val[i]))

        # Finally, we can fit the model with our updated input as well as the target, for one iteration (epoch)
        self.model.fit(update_input, target, batch_size=batch_size, epochs=1, verbose=0)


def run_cartPole_asAgent():
    """Play number_of_trials CartPole-v0 episodes with a DQN agent.

    Every finished episode appends its score to the module-level `scores`
    list and its index to `episodes`, and prints a one-line summary. The
    model is saved to ./best_dnq.h5 whenever the mean of the last (up to)
    ten scores exceeds 195. The environment is closed on the way out, also
    when the user quits early or an error occurs.
    """
    # Maximum score = 200 for v0.
    env = gym.make('CartPole-v0')

    try:
        # Get the number of states and actions from environment (4S : 2A)
        state_size = env.observation_space.shape[0]
        action_size = env.action_space.n

        # Initialise our agent with the states and actions.
        agent = DeepQNetworkAgent(state_size, action_size)

        # Main loop - houses the repetition for each episode (trial)
        for e in range(number_of_trials):
            done = False        # Boolean, is the episode over?
            score = 0           # Measurement of agent performance.
            steps = -1          # Starts at -1 so a full episode is displayed as 199 steps rather than 200.
            t1 = time.time()    # Start time of this episode.

            # Reset our environment and shape the observation as a one-row batch.
            state = env.reset()
            state = np.reshape(state, [1, state_size])

            # Secondary loop - houses the behaviour while the agent is active.
            while not done:
                # Early escape should quit keys be pressed.
                if not bool_do_not_quit:
                    sys.exit()

                # If the CartPole environment is being displayed, render it, and capture user input (key_press function).
                if agent.render:
                    env.render()
                    env.viewer.window.on_key_press = key_press

                # Get the current action, and assign the new parameters with the next step.
                action = agent.decide(state)
                next_state, reward, done, info = env.step(action)
                next_state = np.reshape(next_state, [1, state_size])

                # If the episode ended before the final step (score of 199 so far), penalise the agent.
                reward = reward if not done or score == 199 else -40

                # Save the current transition to the model memory in deque.
                agent.add_sample(state, action, reward, next_state, done)

                # Train the model based upon the prior actions.
                agent.train_model()

                # Update the score / state / step.
                score += reward
                state = next_state
                steps += 1

                # When the agent is done:
                if done:
                    # Record the time taken.
                    elapsed = time.time() - t1

                    # Update the target model for the next episode.
                    agent.update_target_model()

                    # Undo the -40 penalty so the recorded score reflects the steps survived.
                    score = score if score == 200 else score + 40
                    scores.append(score)

                    # Append the episode to the list container, so that the total amount is always known.
                    episodes.append(e)

                    # Display the episodes score to the console.
                    print("Episode", e, "| Score:", score, '|', steps, "steps | %0.2fs."% elapsed)

                    # save the model externally if performance is nearly optimal.
                    if np.mean(scores[-10:]) > 195:
                        agent.model.save("./best_dnq.h5")
    finally:
        # Close the gym env when done, including on early exit or error.
        env.close()


# Train / evaluate the agent on CartPole V0.
run_cartPole_asAgent()

# Report the mean score over the episodes that were completed.
print("\n\nAverage Score for ", len(episodes), " episodes is:", statistics.mean(scores))

# Plot the score of every episode, numbering episodes from 1.
fig = plt.figure()
ax = fig.add_subplot(111)
episode_numbers = np.arange(1, len(scores) + 1)
ax.plot(episode_numbers, scores)
ax.set_title('My agent performance on CartPole-v0')
ax.set_ylabel('Score')
ax.set_xlabel('Agent Episode')
plt.show()
