# Ch15 遊戲對局實戰演練：DRL (Deep Reinforcement Learning)、DQN (Deep Q Network)

####15.3 建構 DQN 代理人

#### 15.3.1 前期準備

#### 匯入所需的模組

In [1]:
import random
import gym
import numpy as np
from collections import deque
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
import os 

#### 設定超參數

In [2]:
env = gym.make('CartPole-v0')

In [3]:
state_size = env.observation_space.shape[0]

In [4]:
action_size = env.action_space.n
action_size

2

In [5]:
batch_size = 32

In [6]:
n_episodes = 1000 # 想玩的回合數

In [7]:
#!!!!注意，執行這個區塊後，請依網頁指示通過授權後載入 Google 雲端硬碟, 再執行底下其他程式區塊
from google.colab import drive
drive.mount('/content/drive')
output_dir = '/content/drive/MyDrive/Colab Notebooks/F1383_Sample/Ch15/model_output/cartpole/'  #註：請記得依你存放的位置彈性修改路徑

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [8]:
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

#### 15.3.2 定義 DQN 代理人的類別

In [9]:
class DQNAgent:
    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=2000)
        self.gamma = 0.95 
        self.epsilon = 1.0
        self.epsilon_decay = 0.995 
        self.epsilon_min = 0.01 
        self.learning_rate = 0.001 
        self.model = self._build_model() 
    # 建立代理人的神經網路模型 – build_model()
    def _build_model(self):
        
        model = Sequential()
        model.add(Dense(32, activation='relu', 
                        input_dim=self.state_size))
        model.add(Dense(32, activation='relu'))
        model.add(Dense(self.action_size, activation='linear'))
        model.compile(loss='mse',
                      optimizer=Adam(lr=self.learning_rate))
        return model
    
    
    #儲存遊戲記憶 (經驗) – remember()
    def remember(self, state, action, reward, next_state, done):
        self.memory.append((state, action, 
                            reward, next_state, done))

    #藉由記憶重播 (Experience Relay) 來訓練 – train()
    def train(self, batch_size):
        minibatch = random.sample(self.memory, batch_size)
        for state, action, reward, next_state, done in minibatch:
            target = reward
            if not done: 
                target = (reward + 
                          self.gamma * # (target) = reward + (discount rate gamma) * 
                          np.amax(self.model.predict(next_state)[0])) 
            target_f = self.model.predict(state) 
            target_f[0][action] = target
            self.model.fit(state, target_f, epochs=1, verbose=0) 
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

    #採取某一行動 – act()
    def act(self, state):
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        act_values = self.model.predict(state)
        return np.argmax(act_values[0]) 
    
    #儲存與載入模型參數 – save()、load()
    def save(self, name):
        self.model.save_weights(name)

    def load(self, name):
        self.model.load_weights(name)

#### 15.4 與 OpenAI Gym 環境互動

In [10]:
agent = DQNAgent(state_size, action_size)

  "The `lr` argument is deprecated, use `learning_rate` instead.")


#### 15.4.1 開始進行遊戲並展開訓練

In [11]:
#註：由於神經網路的初始權重參數是隨機設定的, 參雜了隨機性, 因此底下 (或您重跑一次) 的結果不會與書中完全一樣, 但模型的能力是相近的

for e in range(n_episodes):
    
    state = env.reset()
    state = np.reshape(state, [1, state_size])
    
    done = False
    time = 0 
    while not done: 
        action = agent.act(state) 
        next_state, reward, done, _ = env.step(action) 
        reward = reward if not done else -10 
        next_state = np.reshape(next_state, [1, state_size])
        agent.remember(state, action, reward, next_state, done) 
        state = next_state 
        if done: 
            print("episode: {}/{}, score: {}, e: {:.2}" 
                  .format(e, n_episodes-1, time, agent.epsilon))
        time += 1
    if len(agent.memory) > batch_size:
        agent.train(batch_size) 
    if e % 50 == 0:
        agent.save(output_dir + "weights_" 
                   + '{:04d}'.format(e) + ".hdf5") 

episode: 0/999, score: 65, e: 1.0
episode: 1/999, score: 53, e: 0.99
episode: 2/999, score: 20, e: 0.99
episode: 3/999, score: 39, e: 0.99
episode: 4/999, score: 32, e: 0.98
episode: 5/999, score: 12, e: 0.98
episode: 6/999, score: 13, e: 0.97
episode: 7/999, score: 22, e: 0.97
episode: 8/999, score: 29, e: 0.96
episode: 9/999, score: 50, e: 0.96
episode: 10/999, score: 9, e: 0.95
episode: 11/999, score: 12, e: 0.95
episode: 12/999, score: 48, e: 0.94
episode: 13/999, score: 60, e: 0.94
episode: 14/999, score: 10, e: 0.93
episode: 15/999, score: 27, e: 0.93
episode: 16/999, score: 14, e: 0.92
episode: 17/999, score: 24, e: 0.92
episode: 18/999, score: 86, e: 0.91
episode: 19/999, score: 18, e: 0.91
episode: 20/999, score: 19, e: 0.9
episode: 21/999, score: 74, e: 0.9
episode: 22/999, score: 100, e: 0.9
episode: 23/999, score: 12, e: 0.89
episode: 24/999, score: 38, e: 0.89
episode: 25/999, score: 21, e: 0.88
episode: 26/999, score: 25, e: 0.88
episode: 27/999, score: 14, e: 0.87
episod