In [None]:
import random
import numpy as np
from collections import deque
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense
from tensorflow.keras.optimizers import Adam
from gym.envs.box2d import CarRacing
import cv2
import numpy as np
import os
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import Conv2D, Dense, Flatten, BatchNormalization


# Baseline environment

## Build our own DQN architecture

In [None]:
class CarRacingDQNAgent:
    """Deep Q-Network agent with a replay memory and a target network.

    The agent keeps two networks with the same architecture: ``model`` is
    trained on sampled transitions, while ``target_model`` supplies the
    bootstrap Q-values and is only refreshed by ``update_target_model``.
    Actions are chosen epsilon-greedily and ``epsilon`` is decayed once per
    ``replay`` call until it reaches ``epsilon_min``.

    Args:
        action_space: Number of discrete actions (size of the output layer).
        frame_stack_num: Number of frames stacked into one state.
        memory_size: Maximum number of transitions kept in the replay memory;
            the oldest ones are dropped first.
        gamma: Discount rate applied to the bootstrapped next-state value.
        epsilon: Initial exploration rate.
        epsilon_min: Value at or below which ``epsilon`` stops decaying.
        epsilon_decay: Multiplicative decay applied to ``epsilon`` per
            ``replay`` call.
        learning_rate: Learning rate passed to the Adam optimizer.
        SAVE_FREQUENCY: Stored on the instance; not read by any method of
            this class.
    """

    def __init__(
        self,
        action_space    = 5,
        frame_stack_num = 4,
        memory_size     = 5000,
        gamma           = 0.99,  # discount rate
        epsilon         = 1.0,   # exploration rate
        epsilon_min     = 0.1,
        epsilon_decay   = 0.9999,
        learning_rate   = 0.001,
        SAVE_FREQUENCY = 100
    ):
        self.action_space    = action_space
        self.frame_stack_num = frame_stack_num
        self.memory          = deque(maxlen=memory_size)
        self.gamma           = gamma
        self.epsilon         = epsilon
        self.epsilon_min     = epsilon_min
        self.epsilon_decay   = epsilon_decay
        self.learning_rate   = learning_rate
        self.SAVE_FREQUENCY  = SAVE_FREQUENCY
        self.model           = self.build_model()
        self.target_model    = self.build_model()
        # Start with both networks holding identical weights.
        self.update_target_model()

    # ------------------------------------------------------------------
    # Earlier architectures, kept commented out as a record of what was tried.
    # ------------------------------------------------------------------

    # The first architecture
    #def build_model(self):
        #Neural Net for Deep-Q learning Model
        #model = Sequential()
        #model.add(Conv2D(filters=6, kernel_size=(7, 7), strides=3, activation='relu', input_shape=(96, 96, self.frame_stack_num)))
        #model.add(MaxPooling2D(pool_size=(2, 2)))
        #model.add(Conv2D(filters=12, kernel_size=(4, 4), activation='relu'))
        #model.add(MaxPooling2D(pool_size=(2, 2)))
        #model.add(Flatten())
        #model.add(Dense(216, activation='relu'))
        #model.add(Dense(self.action_space, activation='softmax'))
        #model.compile(loss='mean_squared_error', optimizer=Adam(lr=self.learning_rate, epsilon=1e-7))
        #return model

    # The second architecture
    #def build_model(self):
        #Neural Net for Deep-Q learning Model
        #model = Sequential()
        #model.add(Conv2D(filters=6, kernel_size=(7, 7), strides=3, activation='relu', input_shape=(96, 96, self.frame_stack_num)))
        #model.add(MaxPooling2D(pool_size=(2, 2)))
        #model.add(Conv2D(filters=12, kernel_size=(4, 4), activation='relu'))
        #model.add(MaxPooling2D(pool_size=(2, 2)))
        #model.add(Flatten())
        #model.add(Dense(216, activation='relu'))
        #model.add(Dense(128, activation='relu'))  # added new dense layer
        #model.add(Dense(64, activation='relu'))   # added another dense layer
        #model.add(Dense(self.action_space, activation='softmax'))
        #model.compile(loss='mean_squared_error', optimizer=Adam(lr=self.learning_rate, epsilon=1e-7))
        #return model

    # The third architecture
    #def build_model(self):
        # Neural Net for Deep-Q learning Model
        #model = Sequential()
        #model.add(Conv2D(32, (8, 8), strides=(4, 4), activation='relu', input_shape=( 96, 96, 4), data_format='channels_first'))
        #model.add(Conv2D(64, (4, 4), strides=(2, 2), activation='relu', data_format='channels_first'))
        #model.add(Conv2D(64, (3, 3), strides=(1, 1), activation='relu', data_format='channels_first'))
        #model.add(Flatten())
        #model.add(Dense(512, activation='relu'))
        #model.add(Dense(self.action_space, activation='linear'))
        #model.compile(loss='mse', optimizer=Adam(lr=self.learning_rate, epsilon=1e-7))
        #return model

    # The fourth architecture (the one in use)
    def build_model(self):
        """Build and compile the convolutional Q-network.

        Returns:
            A compiled Keras ``Sequential`` model with one linear output per
            action, trained with mean-squared-error loss and Adam.
        """
        model = Sequential()
        # The leading input dimension follows frame_stack_num, so the network
        # matches the stacked states for any stack size (default 4).
        # NOTE(review): unless Keras is configured for channels-first data,
        # Conv2D reads this shape as (height, width, channels), i.e. the frame
        # axis is convolved over as a spatial axis and the last image axis is
        # used as channels. Feeding states as (96, 96, frame_stack_num) would
        # match the usual layout, but it changes the weight shapes and so
        # breaks existing checkpoints - confirm before changing.
        model.add(Conv2D(filters=64, kernel_size=(8, 8), strides=4, activation='relu',
                         input_shape=(self.frame_stack_num, 96, 96), padding='same'))
        model.add(Conv2D(filters=128, kernel_size=(4, 4), strides=2, activation='relu', padding='same'))
        model.add(Conv2D(filters=256, kernel_size=(3, 3), strides=1, activation='relu', padding='same'))
        model.add(BatchNormalization())
        model.add(Conv2D(filters=256, kernel_size=(3, 3), strides=1, activation='relu', padding='same'))
        model.add(Conv2D(filters=512, kernel_size=(3, 3), strides=1, activation='relu', padding='same'))
        model.add(BatchNormalization())
        model.add(Conv2D(filters=512, kernel_size=(3, 3), strides=1, activation='relu', padding='same'))
        model.add(Flatten())
        model.add(Dense(512, activation='relu'))
        # Linear output: the values are unbounded Q-value estimates.
        model.add(Dense(self.action_space, activation='linear'))
        # `learning_rate` replaces the deprecated `lr` keyword.
        model.compile(loss='mse', optimizer=Adam(learning_rate=self.learning_rate, epsilon=1e-7))
        return model

    def update_target_model(self):
        """Copy the weights of the online network into the target network."""
        self.target_model.set_weights(self.model.get_weights())

    @staticmethod
    def _snapshot_frames(frames):
        """Return a snapshot of a frame stack that later mutation cannot alter.

        A deque/list of frames is captured as a tuple that still shares the
        individual frame arrays, so no pixel data is duplicated; an ndarray is
        copied.
        """
        if isinstance(frames, np.ndarray):
            return frames.copy()
        return tuple(frames)

    def memorize(self, state, action, reward, next_state, done):
        """Append one transition to the replay memory.

        ``state`` and ``next_state`` are snapshotted first, so a caller that
        keeps appending to the same frame queue does not silently rewrite
        transitions that were stored earlier.
        """
        self.memory.append((
            self._snapshot_frames(state),
            action,
            reward,
            self._snapshot_frames(next_state),
            done,
        ))

    def act(self, state):
        """Choose an action index epsilon-greedily.

        Args:
            state: One frame stack (any sequence of frames or an array).

        Returns:
            With probability ``epsilon`` a uniformly random action index,
            otherwise the index of the largest Q-value predicted by the
            online network.
        """
        if np.random.rand() > self.epsilon:
            act_values = self.model.predict(np.expand_dims(state, axis=0))
            action_index = np.argmax(act_values[0])
        else:
            action_index = random.randint(0, self.action_space - 1)
        return action_index

    def replay(self, batch_size):
        """Train the online network on a random minibatch from memory.

        For each sampled transition only the Q-value of the action that was
        taken is replaced, by ``reward`` for terminal transitions and by
        ``reward + gamma * max(target_model Q(next_state))`` otherwise; the
        remaining outputs keep the online network's own predictions.
        ``epsilon`` is decayed once per call while it is above
        ``epsilon_min``.

        Args:
            batch_size: Number of transitions to sample.

        Raises:
            ValueError: From ``random.sample`` when the memory holds fewer
                than ``batch_size`` transitions.
        """
        minibatch = random.sample(self.memory, batch_size)

        states = np.array([np.array(s) for s, _, _, _, _ in minibatch])
        next_states = np.array([np.array(ns) for _, _, _, ns, _ in minibatch])

        # Two batched forward passes instead of two per sampled transition.
        targets = np.array(self.model.predict(states))
        next_q_values = self.target_model.predict(next_states)

        for row, (_, action_index, reward, _, done) in enumerate(minibatch):
            if done:
                targets[row][action_index] = reward
            else:
                targets[row][action_index] = reward + self.gamma * np.amax(next_q_values[row])

        self.model.fit(states, targets, epochs=1, verbose=0)
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

    def load(self, name):
        """Load weights from ``name`` into the online network and sync the target."""
        self.model.load_weights(name)
        self.update_target_model()

    def save(self, name):
        """Save the target network's weights to ``name``.

        NOTE(review): this writes ``target_model``, not ``model``; the two
        differ whenever training has run since the last
        ``update_target_model`` call - confirm this is intended.
        """
        self.target_model.save_weights(name)


In [None]:
def process_state_image(state):
    """Turn a colour frame into a grayscale float image scaled by 1/255.

    Args:
        state: Colour image accepted by ``cv2.cvtColor``.

    Returns:
        The single-channel image as a float array, each value divided by 255.
    """
    # NOTE(review): the conversion assumes BGR channel order. If the
    # environment returns RGB frames, cv2.COLOR_RGB2GRAY would be the
    # matching constant - confirm against the environment.
    grayscale = cv2.cvtColor(state, cv2.COLOR_BGR2GRAY)
    return grayscale.astype(float) / 255.0


## Training

In [None]:
# Draw the environment window on every step of the training loop.
RENDER = True

# First and last episode numbers of the run (both inclusive).
STARTING_EPISODE, ENDING_EPISODE = 1, 2000

# Each chosen action is applied for SKIP_FRAMES + 1 consecutive env steps.
SKIP_FRAMES = 2

# Number of transitions sampled from replay memory per training step.
TRAINING_BATCH_SIZE = 64

# Episode intervals for writing weights to disk and for syncing the target network.
SAVE_TRAINING_FREQUENCY, UPDATE_TARGET_MODEL_FREQUENCY = 5, 5



In [None]:
# Directory that receives the periodic weight checkpoints. A raw string keeps
# the backslashes of the Windows path from being read as escape sequences.
MODEL_SAVE_DIR = r"C:\DSBA\Term 2\Reinforcement Learning\Project\DQNmodel"

env = CarRacing(
    grayscale=0,
    show_info_panel=0,
    discretize_actions='hard',
    frames_per_state=1,
    num_lanes=1,
    num_lanes_changes=1,
    num_tracks=1,
    allow_reverse=False,
    max_time_out=2,
    verbose=0
)

agent = CarRacingDQNAgent()

# try/finally so the environment is closed even if training raises or is
# interrupted from the keyboard.
try:
    for e in range(STARTING_EPISODE, ENDING_EPISODE + 1):
        init_state = process_state_image(env.reset())
        total_reward = 0
        # Seed the rolling window with copies of the first frame.
        state_frame_stack_queue = deque([init_state] * agent.frame_stack_num, maxlen=agent.frame_stack_num)
        time_frame_counter = 1
        done = False

        while True:
            if RENDER:
                env.render('human')

            # Snapshot the window BEFORE stepping. The queue is mutated in
            # place below, so passing the queue itself would make the stored
            # state and next_state the same object with identical contents.
            # A tuple shares the frame arrays, so no pixel data is copied.
            current_state_frame_stack = tuple(state_frame_stack_queue)
            action = agent.act(current_state_frame_stack)

            # Repeat the chosen action and accumulate the reward.
            reward = 0
            for _ in range(SKIP_FRAMES + 1):
                next_state, r, done, info = env.step(action)
                reward += r
                if done:
                    break

            total_reward += reward

            state_frame_stack_queue.append(process_state_image(next_state))
            next_state_frame_stack = tuple(state_frame_stack_queue)

            agent.memorize(current_state_frame_stack, action, reward, next_state_frame_stack, done)

            if done:
                # Reward uses fixed-point formatting; '{:.2}' printed values
                # such as -1.1e+02.
                print('Episode: {}/{}, Scores(Time Frames): {}, Total Rewards(adjusted): {:.2f}, Epsilon: {:.2}'.format(e, ENDING_EPISODE, time_frame_counter, float(total_reward), float(agent.epsilon)))
                break
            if len(agent.memory) > TRAINING_BATCH_SIZE:
                agent.replay(TRAINING_BATCH_SIZE)
            time_frame_counter += 1

        if e % UPDATE_TARGET_MODEL_FREQUENCY == 0:
            agent.update_target_model()

        if e % SAVE_TRAINING_FREQUENCY == 0:
            # One file per checkpoint episode; previously the per-episode path
            # was built but the bare directory path was passed to save().
            os.makedirs(MODEL_SAVE_DIR, exist_ok=True)
            save_path = os.path.join(MODEL_SAVE_DIR, f"car_racing_dqn_agent_{e}.h5")
            agent.save(save_path)
finally:
    env.close()

####################################
No support for several frames in RGB
Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Episode: 1/2000, Scores(Time Frames): 49, Total Rewards(adjusted): -1.1e+02, Epsilon: 1.0
Episode: 2/2000, Scores(Time Frames): 276, Total Rewards(adjusted): -1.6e+02, Epsilon: 0.97
Episode: 3/2000, Scores(Time Frames): 88, Total Rewards(adjusted): -1.2e+02, Epsilon: 0.97
Episode: 4/2000, Scores(Time Frames): 190, Total Rewards(adjusted): -1.5e+02, Epsilon: 0.95
Episode: 5/2000, Scores(Time Frames): 164, Total Rewards(adjusted): -1.4e+02, Epsilon: 0.93
Episode: 6/2000, Scores(Time Frames): 78, Total Rewards(adjusted): -1.2e+02, Epsilon: 0.93
