In [5]:
import numpy as np
import gym
import pandas as pd
import tensorflow
from keras.layers import Dense, Activation, Flatten
from keras.models import Sequential, load_model
from tensorflow.keras.optimizers import Adam
import numpy as np

In [6]:
class ReplayBuffer(object):
    def __init__(self, max_size, input_dims):
        self.mem_size = max_size
        self.mem_cntr = 0
        self.state_memory = np.zeros((self.mem_size, *input_dims), dtype=np.float32)
        self.new_state_memory = np.zeros((self.mem_size, *input_dims), dtype=np.float32)     
        self.action_memory = np.zeros((self.mem_size), dtype=np.int32)
        self.reward_memory = np.zeros(self.mem_size, dtype=np.int32)
        self.terminal_memory = np.zeros(self.mem_size, dtype=np.int32)
    
    def store_transition(self, state, action, reward, state_, done):
        index = self.mem_cntr % self.mem_size
        self.state_memory[index] = state
        self.new_state_memory[index] = state_
        self.reward_memory[index] = reward
        self.terminal_memory[index] = 1 - int(done)
        self.action_memory[index] = action
        self.mem_cntr += 1
        
    def sample_buffer(self, batch_size):
        max_mem = min(self.mem_cntr, self.mem_size)
        batch = np.random.choice(max_mem, batch_size, replace=False)
        
        states = self.state_memory[batch]
        states_ = self.new_state_memory[batch]
        rewards = self.reward_memory[batch]
        actions = self.action_memory[batch]
        terminal = self.terminal_memory[batch]
        
        return states, actions, rewards, states_, terminal
    
def build_dqn(lr, n_actions, input_dims, fc1_dims, fc2_dims):
    model = Sequential([
                Flatten(input_shape=(input_dims)),
                Dense(fc1_dims),
                Activation('relu'),
                Dense(fc2_dims),
                Activation('relu'),
                Dense(n_actions)])
    
    model.compile(optimizer=Adam(learning_rate=lr), loss='mse')
    return model

class Agent(object):
    def __init__(self, alpha, gamma, n_actions, epsilon, batch_size, input_dims,
                 epsilon_dec=0.994, epsilon_end=0.01, epsilon_method='log',                 
                 update_method='hard_copy', tau=0.01, replace_target=100,
                 mem_size=500000):
        self.action_space = [i for i in range(n_actions)]
        self.n_actions = n_actions
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_dec = epsilon_dec
        self.epsilon_min = epsilon_end
        self.epsilon_method = epsilon_method
        self.batch_size = batch_size        
        self.memory = ReplayBuffer(mem_size, input_dims)
        self.update_method = update_method
        self.tau = tau
        self.replace_target = replace_target
        
        self.q_eval = build_dqn(alpha, n_actions, input_dims, 32, 64)
        self.q_target = build_dqn(alpha, n_actions, input_dims, 32, 64)
        
        
    def remember(self, state, action, reward, new_state, done):
        self.memory.store_transition(state, action, reward, new_state, done)
        
    def choose_action(self, state):
        state = state[np.newaxis, :]
        if np.random.random() < self.epsilon: action = np.random.choice(self.action_space)
        else:
            actions = self.q_eval.predict(state)
            action = np.argmax(actions)
            
        return action
    
    def learn(self):
        if self.memory.mem_cntr < self.batch_size: return
        
        states, actions, rewards, states_, dones = self.memory.sample_buffer(self.batch_size)
        
        q_next = self.q_target.predict(states_)
        q_eval = self.q_eval.predict(states_)
        q_target = self.q_eval.predict(states)
        
        max_actions = np.argmax(q_eval, axis=1)
        
        batch_index = np.arange(self.batch_size, dtype=np.int32)
        
        q_target[batch_index, actions] = rewards + self.gamma*q_next[batch_index, max_actions.astype(int)]*dones
                                
        self.q_eval.fit(states, q_target, verbose=0)    #q_eval hálózat tanítása
        
        self.update_network_parameters()                #q_target hálózat tanítása
            
    
    def update_network_parameters(self):
        #Periodikus hard-copy
        if self.memory.mem_cntr % self.replace_target == 0 and self.update_method == 'hard_copy':
            self.q_target.set_weights(self.q_eval.get_weights())
        
        #Polyak averaging
        if self.update_method == 'polyak':
            q_e = np.array(self.q_eval.get_weights())
            q_t = np.array(self.q_target.get_weights())
            self.q_target.set_weights(0.01*q_e + (1. - 0.01)*q_t)
        
    def update_epsilon(self):
        if self.epsilon_method == 'log': self.epsilon = self.epsilon*self.epsilon_dec if self.epsilon > self.epsilon_min else self.epsilon_min
        if self.epsilon_method == 'linear': self.epsilon = self.epsilon-self.epsilon_dec if self.epsilon > self.epsilon_min else self.epsilon_min
                                
    def save_model(self, fname = 'ddqn_model.h5'):
        self.q_eval.save(fname)
        
    def load_model(self, fname = 'ddqn_model.h5'):
        self.q_eval = load_model(fname)
        if self.epsilon == 0.0: self.update_network_parameters()


In [7]:
#Paraméterek:
env = gym.make('MountainCar-v0')
n_games = 1000                      # epizódok száma
max_steps = 500                     # egy epizódon belüli lépések maximális száma
alpha = 0.0005                      # neurális háló tanulási sebessége
gamma = 0.99                        # diszkonttényező jutalom számításánál
epsilon = 1.0                       # random cselekvés valószínűsége az első epizódban
epsilon_dec = 0.002                 # linear esetén levonjuk / log esetén beszorozzuk vele az előző értéket
epsilon_dec_method = 'linear'       # log és linear van egyelőre
epsilon_min = 0.01    
mem_size = 1000000                  # mennyi állapotváltozást tárolhat maximum a memóriában
batch_size = 64                     # ekkora kötegeken tanul a neurális háló
network_update_method = 'polyak'    # polyak vagy hard-copy
tau = 0.01                          # Csak a polyak esetén
replace_target = 100                # Csak hard-copy esetén, milyen időközönként frissitse a target hálózatot

#Ágens létrehozása, modell betöltése -ha kell-
agent = Agent(alpha=alpha, gamma=gamma, epsilon=epsilon, epsilon_dec=epsilon_dec,
                epsilon_end=epsilon_min, epsilon_method=epsilon_dec_method,
                input_dims=env.observation_space.shape, n_actions=env.action_space.n,
                mem_size=mem_size, batch_size=batch_size,
                update_method=network_update_method, tau=tau, replace_target=replace_target)

#   agent.load_model(fname = 'dqn_model.h5')   

scores = []
eps_history = []
ma100 = []

#Tanítás
for i in range(n_games):
    done = False
    score = 0
    #step_counter = 0
    observation = env.reset()
    
    for step in range(max_steps):
        #env.render()
        action = agent.choose_action(observation)
        observation_, reward, done, info = env.step(action)
        #step_counter += 1
        score += reward
        agent.remember(observation, action, reward, observation_, done)
        observation = observation_
        agent.learn()
        if done:
            break

    scores.append(round(score,4))   
    avg_score = np.mean(scores[max(0, i-100):(i+1)])
    ma100.append(round(avg_score,4))
    eps_history.append(round(agent.epsilon,4))
    
    print('episode', i+1,
            'epsilon %.2f' % agent.epsilon,
            #'steps', step_counter,
            'score %.2f' % score,
            'ma100 %.2f' % avg_score)
        
    if (i+1) % 500 == 0 and i > 0:            
        agent.save_model(fname = 'ddqn_model' + str(i+1) +'.h5')
    
    if avg_score > 200:
        agent.save_model(fname = 'ddqn_model' + str(i+1) +'.h5')
        break
    
    if(i%10==0):
        print("Episode:"+str(i)+" Average score"+str(avg_score))

    agent.update_epsilon()
    
result_df = pd.DataFrame({'Score':scores, 'MA100':ma100, 'Epsilon':eps_history})
result_df.to_csv("ddqn_results.csv")
env.close()

  q_e = np.array(self.q_eval.get_weights())
  q_t = np.array(self.q_target.get_weights())


episode 1 epsilon 1.00 score -200.00 ma100 -200.00
Episode:0 Average score-200.0
episode 2 epsilon 1.00 score -200.00 ma100 -200.00
episode 3 epsilon 1.00 score -200.00 ma100 -200.00
episode 4 epsilon 0.99 score -200.00 ma100 -200.00
episode 5 epsilon 0.99 score -200.00 ma100 -200.00
episode 6 epsilon 0.99 score -200.00 ma100 -200.00
episode 7 epsilon 0.99 score -200.00 ma100 -200.00
episode 8 epsilon 0.99 score -200.00 ma100 -200.00
episode 9 epsilon 0.98 score -200.00 ma100 -200.00
episode 10 epsilon 0.98 score -200.00 ma100 -200.00
episode 11 epsilon 0.98 score -200.00 ma100 -200.00
Episode:10 Average score-200.0
episode 12 epsilon 0.98 score -200.00 ma100 -200.00
episode 13 epsilon 0.98 score -200.00 ma100 -200.00
episode 14 epsilon 0.97 score -200.00 ma100 -200.00
episode 15 epsilon 0.97 score -200.00 ma100 -200.00
episode 16 epsilon 0.97 score -200.00 ma100 -200.00
episode 17 epsilon 0.97 score -200.00 ma100 -200.00
episode 18 epsilon 0.97 score -200.00 ma100 -200.00
episode 19 e

KeyboardInterrupt: 