In [7]:
import sys
import gym
#import pylab
import random
import numpy as np
from collections import deque
from keras.layers import Dense
from keras.optimizers import Adam
from keras.models import Sequential

from scores.score_logger import ScoreLogger

# Number of episodes the main training loop iterates over.
EPISODES = 300
# Adam learning rate used when compiling the RewardPredictor model
# (DoubleDQNAgent sets its own self.learning_rate instead of reading this).
LEARNING_RATE = 0.001

In [8]:
# Double DQN Agent for the Cartpole
# it uses Neural Network to approximate q function
# and replay memory & target q network
class DoubleDQNAgent:
    """Double DQN agent: an online Q-network, a target Q-network and a replay memory.

    The online network (``self.model``) selects the greedy next action and the
    target network (``self.target_model``) supplies the value of that action
    when ``train_model`` builds its regression targets.

    Args:
        state_size: Number of features in one observation (network input width).
        action_size: Number of discrete actions (network output width).
    """

    def __init__(self, state_size, action_size):
        # if you want to see Cartpole learning, then change to True
        self.render = False
        # set to True to start from the weights stored on disk
        self.load_model = False
        # get size of state and action
        self.state_size = state_size
        self.action_size = action_size

        # these are the hyper parameters for the Double DQN
        self.discount_factor = 0.99
        self.learning_rate = 0.001
        self.epsilon = 1.0
        self.epsilon_decay = 0.999
        self.epsilon_min = 0.01
        self.batch_size = 64
        self.train_start = 1000
        # create replay memory using deque (oldest samples are dropped first)
        self.memory = deque(maxlen=2000)

        # create main model and target model
        self.model = self.build_model()
        self.target_model = self.build_model()

        # Load saved weights BEFORE syncing the target network; otherwise the
        # target network would keep its random initial weights while the main
        # model runs with the restored ones.
        if self.load_model:
            self.model.load_weights("./save_model/cartpole_ddqn.h5")

        # initialize target model so both networks start identical
        self.update_target_model()

    # approximate Q function using Neural Network
    # state is input and Q Value of each action is output of network
    def build_model(self):
        """Build and compile a 24-24-``action_size`` dense network (MSE loss, Adam).

        Returns:
            The compiled Keras ``Sequential`` model. Its summary is printed as
            a side effect.
        """
        model = Sequential()
        model.add(Dense(24, input_dim=self.state_size, activation='relu',
                        kernel_initializer='he_uniform'))
        model.add(Dense(24, activation='relu',
                        kernel_initializer='he_uniform'))
        model.add(Dense(self.action_size, activation='linear',
                        kernel_initializer='he_uniform'))
        model.summary()
        model.compile(loss='mse', optimizer=Adam(lr=self.learning_rate))
        return model

    # after some time interval update the target model to be same with model
    def update_target_model(self):
        """Copy the main model's weights into the target model."""
        self.target_model.set_weights(self.model.get_weights())

    # get action from model using epsilon-greedy policy
    def get_action(self, state):
        """Return an action index chosen epsilon-greedily.

        Args:
            state: Input for ``self.model.predict``; only the first row of the
                prediction is used, so it should hold a single observation.

        Returns:
            A random action with probability ``self.epsilon``, otherwise the
            argmax of the predicted Q-values.
        """
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        else:
            q_value = self.model.predict(state)
            return np.argmax(q_value[0])

    # save sample <s,a,r,s'> to the replay memory
    def append_sample(self, state, action, reward, next_state, done):
        """Store one transition and decay epsilon while it is above ``epsilon_min``."""
        self.memory.append((state, action, reward, next_state, done))
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

    # pick samples randomly from replay memory (with batch_size)
    def train_model(self):
        """Run one Double DQN update on a random mini-batch from the replay memory.

        Does nothing until the memory holds at least ``self.train_start``
        samples. The mini-batch holds ``min(self.batch_size, len(self.memory))``
        transitions, and every later step uses that actual size so a memory
        smaller than ``self.batch_size`` no longer causes an ``IndexError``.
        """
        if len(self.memory) < self.train_start:
            return
        batch_size = min(self.batch_size, len(self.memory))
        mini_batch = random.sample(self.memory, batch_size)

        update_input = np.zeros((batch_size, self.state_size))
        update_target = np.zeros((batch_size, self.state_size))
        action, reward, done = [], [], []

        for i, (state, act, rew, next_state, is_done) in enumerate(mini_batch):
            update_input[i] = state
            action.append(act)
            reward.append(rew)
            update_target[i] = next_state
            done.append(is_done)

        target = self.model.predict(update_input)
        target_next = self.model.predict(update_target)
        target_val = self.target_model.predict(update_target)

        # Iterate over the number of rows actually sampled, not the configured
        # self.batch_size, which may be larger than the mini-batch.
        for i in range(batch_size):
            if done[i]:
                # terminal transition: no bootstrapped future value
                target[i][action[i]] = reward[i]
            else:
                # the key point of Double DQN
                # selection of action is from model
                # update is from target model
                a = np.argmax(target_next[i])
                target[i][action[i]] = reward[i] + self.discount_factor * (
                    target_val[i][a])

        # make minibatch which includes target q value and predicted q value
        # and do the model fit!
        self.model.fit(update_input, target, batch_size=batch_size,
                       epochs=1, verbose=0)



In [9]:
class RewardPredictor:
    """Small dense classifier with a sigmoid output, trained with binary cross-entropy.

    ``predict`` thresholds the network output at 0.5 and reports the result
    as a reward of 1 or -1.
    """

    def __init__(self, input_shape, output_shape):
        """Build and compile the 24-10-``output_shape`` network.

        Args:
            input_shape: Width of one input vector.
            output_shape: Number of sigmoid output units.
        """
        self.input_shape = input_shape
        self.output_shape = output_shape

        layers = [
            Dense(24, input_shape=(self.input_shape,), activation="relu"),
            Dense(10, activation="relu"),
            Dense(self.output_shape, activation="sigmoid"),
        ]
        network = Sequential()
        for layer in layers:
            network.add(layer)
        network.compile(loss="binary_crossentropy", optimizer=Adam(lr=LEARNING_RATE))
        self.model = network

    def fit(self, state, reward):
        """Run a silent ``fit`` call of the underlying model on the given inputs and labels."""
        self.model.fit(state, reward, verbose=0)

    def predict(self, state):
        """Return 1 when the model output exceeds 0.5, otherwise -1."""
        probability = self.model.predict(state)
        return 1 if probability > 0.5 else -1

    def save_model(self):
        """Save the underlying model to 'my_model_reward_predictor.h5'."""
        target_path = 'my_model_reward_predictor.h5'
        self.model.save(target_path)
        

In [10]:
# Environment, agent, reward predictor and bookkeeping state used by the
# training loop.
ENV_NAME = "CartPole-v1"
env = gym.make(ENV_NAME)
# get size of state and action from environment
state_size = env.observation_space.shape[0]
action_size = env.action_space.n

# receives (step count, run index) at the end of every episode
score_logger = ScoreLogger(ENV_NAME)

agent = DoubleDQNAgent(state_size, action_size)

# The predictor's input is the state with the chosen action appended
# (hence state_size + 1); it has a single output unit.
reward_noise_remover = RewardPredictor(state_size + 1,1)

# NOTE(review): scores/episodes are only referenced from commented-out code
# in the training loop.
scores, episodes = [], []
# episode counter, incremented at the start of each episode and used for logging
run = 0

"""
Implement the noisy reward
set the noise power noise_power
"""
# per-step probability that the sign of the reward is flipped
noise_power = 0.1
# NOTE(review): reward_space is not read anywhere in the visible notebook.
reward_space = [-1,1]

# number of environment steps counted so far; once it exceeds the threshold
# in the loop, the predictor's output replaces the noisy reward
counter_to_use_reward = 0
# becomes True once the predictor is active, which freezes the counter
dont_count = False
# ensures the activation message is printed only once
print_flag = True

# Main training loop: one iteration per episode. Each step injects sign-flip
# noise into the reward, optionally replaces it with the reward predictor's
# output, trains the predictor on the noisy label, and trains the agent.
for e in range(EPISODES):
    done = False
    score = 0
    state = env.reset()
    state = np.reshape(state, [1, state_size])
    step = 0
    run += 1
    while not done:
        step += 1
        if agent.render:
            env.render()

        # get action for the current state and go one step in environment
        action = agent.get_action(state)
        next_state, reward, done, info = env.step(action)
        next_state = np.reshape(next_state, [1, state_size])
        # when the step ends the episode, the sign of the reward is flipped
        # (an earlier -100 penalty variant is kept commented out below)
        #reward = reward if not done or score == 499 else -100
        reward = reward if not done else -reward
        
        # with probability noise_power, flip the sign of the reward
        random_number = np.random.uniform(0,1,1)
        if random_number < noise_power:
            reward = -reward
        
        ## Noise Filtering Process
        
        # predictor input: current state with the chosen action appended,
        # flattened and reshaped to a single row of width state_size + 1
        reward_noise_state = state
        reward_noise_state = np.concatenate((reward_noise_state,action),axis=None)
        reward_noise_state = np.reshape(reward_noise_state, [1, state_size+1])
        #print(reward_noise_state.shape)
        ## Filter noise
        # binary label from the (possibly noisy) reward: 1 if positive, else 0.
        # It is computed before the reward may be replaced below, so the
        # predictor is always trained on the noisy reward, not its own output.
        old_reward = 1 if reward > 0 else 0
        old_reward = np.reshape(old_reward, [1, 1])
        if not dont_count :
            counter_to_use_reward += 1
        # from the 1001st counted step onward, the reward handed to the agent
        # is the predictor's output (1 or -1) instead of the noisy reward
        if counter_to_use_reward > 1000:
            if print_flag:
                print('noise remover activated')
                print_flag = False
            dont_count = True
            reward = reward_noise_remover.predict(reward_noise_state)
        
        ## train the predictor on this step's (state, action) -> noisy label pair
        reward_noise_remover.fit(reward_noise_state, old_reward)
        
        ########

        # save the sample <s, a, r, s'> to the replay memory
        agent.append_sample(state, action, reward, next_state, done)
        # every time step do the training
        agent.train_model()
        # NOTE(review): score is accumulated but only read by commented-out code
        score += reward
        state = next_state

        if done:
            # every episode update the target model to be same with model
            agent.update_target_model()

            # every episode, plot the play time
            #score = score if score == 500 else score + 100
            #scores.append(score)
            
            # the value reported as "score" is the episode's step count
            print("Run: " + str(run) + ", exploration: " + str(agent.epsilon) + ", score: " + str(step))
            score_logger.add_score(step, run)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_10 (Dense)             (None, 24)                120       
_________________________________________________________________
dense_11 (Dense)             (None, 24)                600       
_________________________________________________________________
dense_12 (Dense)             (None, 2)                 50        
Total params: 770
Trainable params: 770
Non-trainable params: 0
_________________________________________________________________
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_13 (Dense)             (None, 24)                120       
_________________________________________________________________
dense_14 (Dense)             (None, 24)                600       
_________________________________________________________________
dense_15 (De

  z = np.polyfit(np.array(trend_x), np.array(y[1:]), 1)


Scores: (min: 15, avg: 20, max: 25)

Run: 3, exploration: 0.925854183751895, score: 37
Scores: (min: 15, avg: 25.666666666666668, max: 37)

Run: 4, exploration: 0.9111511025165902, score: 16
Scores: (min: 15, avg: 23.25, max: 37)

Run: 5, exploration: 0.9002772252562138, score: 12
Scores: (min: 12, avg: 21, max: 37)

Run: 6, exploration: 0.8789172357313328, score: 24
Scores: (min: 12, avg: 21.5, max: 37)

Run: 7, exploration: 0.8701675093639105, score: 10
Scores: (min: 10, avg: 19.857142857142858, max: 37)

Run: 8, exploration: 0.8606433826830369, score: 11
Scores: (min: 10, avg: 18.75, max: 37)

Run: 9, exploration: 0.8520755747117399, score: 10
Scores: (min: 10, avg: 17.77777777777778, max: 37)

Run: 10, exploration: 0.8419067177676068, score: 12
Scores: (min: 10, avg: 17.2, max: 37)

Run: 11, exploration: 0.831859218194368, score: 12
Scores: (min: 10, avg: 16.727272727272727, max: 37)

Run: 12, exploration: 0.8202885863627752, score: 14
Scores: (min: 10, avg: 16.5, max: 37)

Run: 13

Scores: (min: 8, avg: 14.428571428571429, max: 37)

Run: 85, exploration: 0.2935777088557856, score: 13
Scores: (min: 8, avg: 14.411764705882353, max: 37)

Run: 86, exploration: 0.2912372909409696, score: 8
Scores: (min: 8, avg: 14.337209302325581, max: 37)

Run: 87, exploration: 0.28776160118260813, score: 12
Scores: (min: 8, avg: 14.310344827586206, max: 37)

Run: 88, exploration: 0.284327391068757, score: 12
Scores: (min: 8, avg: 14.284090909090908, max: 37)

Run: 89, exploration: 0.28093416557223355, score: 12
Scores: (min: 8, avg: 14.258426966292134, max: 37)

Run: 90, exploration: 0.2781374323007875, score: 10
Scores: (min: 8, avg: 14.21111111111111, max: 37)

Run: 91, exploration: 0.2734467341692626, score: 17
Scores: (min: 8, avg: 14.241758241758241, max: 37)

Run: 92, exploration: 0.26910424739696437, score: 16
Scores: (min: 8, avg: 14.26086956521739, max: 37)

Run: 93, exploration: 0.26536117873480936, score: 14
Scores: (min: 8, avg: 14.258064516129032, max: 37)

Run: 94, exp

KeyboardInterrupt: 

In [None]:
# run 25 noise remover activated
# run 67
# run 21

In [5]:
# Save the agent's main Q-network to "my_model_filtered_noisy_reward.h5" (relative path).
# NOTE(review): this cell's execution count (In [5]) is lower than the training
# cell's (In [10]) — confirm the saved model comes from the intended run.
agent.model.save("my_model_filtered_noisy_reward.h5")

In [6]:
# Save the reward predictor's model via RewardPredictor.save_model
# (which writes 'my_model_reward_predictor.h5').
# NOTE(review): execution count In [6] is lower than the training cell's In [10]
# — confirm this ran against the intended predictor.
reward_noise_remover.save_model()