In [13]:
import torch
import copy
from collections import deque
from Gridworld import Gridworld
import numpy as np
import random

L1 = 64
L2 = 150
L3 = 100
L4 = 4

model = torch.nn.Sequential(
    torch.nn.Linear(L1,L2),
    torch.nn.ReLU(),
    torch.nn.Linear(L2,L3),
    torch.nn.ReLU(),
    torch.nn.Linear(L3,L4)
)

model2 = copy.deepcopy(model) # 完整複製主要Q網絡的架構，產生目標網路
model2.load_state_dict(model.state_dict()) # 將主要Q網絡的參數賦值給目標網絡
loss_fn = torch.nn.MSELoss()
learning_rate = 1e-3
optimizer = torch.optim.Adam(model.parameters(),lr=learning_rate)

gamma = 0.9
epsilon = 1.0

epochs = 5000
losses = []
mem_size = 1000 # 設定記憶串列的大小
batch_size = 200 # 設定批次的大小
replay = deque(maxlen=mem_size)
max_moves = 50
sync_freq = 500 # 設定Q網路和目標網絡的參數同步頻率(每500steps就同步一次參數)
j = 0 # 記錄當前訓練次數
action_set = {
    0: 'u',
    1: 'd',
    2: 'l',
    3: 'r'
}

for i in range(epochs):
    game = Gridworld(size=4,mode="random")
    state1_ = game.board.render_np().reshape(1,64) + np.random.rand(1,64)/100.0
    state1 = torch.from_numpy(state1_).float()
    status = 1
    mov = 0
    while(status == 1):
        j += 1 # 將訓練次數加一
        mov += 1
        qval = model(state1)
        qval_ = qval.data.numpy()
        if (random.random() < epsilon):
            action_ = np.random.randint(0,4)
        else:
            action_ = np.argmax(qval_)
        action = action_set[action_]
        game.makeMove(action)

        state2_ = game.board.render_np().reshape(1,64) + np.random.rand(1,64)/100.0
        state2 = torch.from_numpy(state2_).float()
        reward = game.reward()
        done = True if reward > 0 else False
        exp = (state1,action_,reward,state2,done)
        replay.append(exp)
        state1 = state2

        if len(replay) > batch_size:
            mini_batch = random.sample(replay,batch_size)

            state1_batch = torch.cat([s1 for (s1,a,r,s2,d) in mini_batch])
            action_batch = torch.Tensor([a for (s1,a,r,s2,d) in mini_batch])
            reward_batch = torch.Tensor([r for (s1,a,r,s2,d) in mini_batch])
            state2_batch = torch.cat([s2 for (s1,a,r,s2,d) in mini_batch])
            done_batch = torch.Tensor([d for (s1,a,r,s2,d) in mini_batch])

            Q1 = model(state1)
            with torch.no_grad(): # 用目標網絡計算目標Q值，但不要優化計算模型的參數
                Q2 = model2(state2_batch)
            Y = reward_batch + gamma*((1-done_batch)*torch.max(Q2,dim=1)[0])
            print(Y)
            X = Q1.gather(dim=1,index=action_batch.long().unsqueeze(dim=1)).squeeze()
            print(X)
            print("000")
#             Y = reward_batch + gamma * ((1-done_batch) * torch.max(Q2,dim=1)[0])
#             X = Q1.gather(dim=1,index=action_batch.long().unsqueeze(dim=1)).squeeze()
            print(X.shape)
            print(Y.shape)
            loss = loss_fn(X, Y.detach())
            print(i, loss.item())
            #clear_output(wait=True)
            optimizer.zero_grad()
            loss.backward()
            losses.append(loss.item())
            optimizer.step()
            
            if j % sync_freq == 0: #C
                model2.load_state_dict(model.state_dict())
        if reward != -1 or mov > max_moves:
            status = 0
            mov = 0
    if epsilon > 0.1:
        epsilon -= (1/epochs) # 讓ε的值隨著訓練的進行而慢慢下降，直到0.1(還是要保留探索的動作)
losses = np.array(losses)

plt.figure(figsize=(10,7))
plt.plot(losses)
plt.xlabel("Epochs",fontsize=11)
plt.ylabel("Loss",fontsize=11)
plt.show()

tensor([ -0.9856,  -1.0187,  -1.0108,  -1.0079,  -0.9856,  -1.0155,  -0.9909,
         -0.9900,  -1.0052,  -0.9874,  -1.0056,  -1.0004,  -1.0032,  -1.0122,
         -0.9957,  -1.0058,  -0.9969,  -1.0111,  -1.0050,  -0.9890,  -0.9854,
         -1.0175,  -1.0060,  -1.0111,  -1.0008,  -0.9903,  -0.9854,  -1.0116,
         -1.0152,  -1.0078,  -1.0124,  -1.0002,  -1.0038,  -0.9892,  -1.0022,
         -1.0127,  -0.9871,  -1.0018,  -0.9959,  -0.9949,  -0.9872,  -0.9854,
         -0.9945,  -1.0049,  10.0000, -10.0059,  -0.9958,  10.0000,  -1.0115,
         -1.0110,  -0.9956,  -0.9856,  -0.9905,  -1.0119,  -0.9933,  -1.0113,
         -0.9961,  -1.0058,  -1.0051,  -1.0051,  -1.0063,  -1.0080,  -1.0217,
         -1.0146,  10.0000,  -0.9952,  -1.0013,  -1.0027,  -1.0075,  -1.0005,
         -1.0116,  -1.0025,  -0.9906,  -1.0019,  -1.0128,  -1.0027,  -1.0169,
         -1.0124,  -1.0134,  -1.0049,  -1.0111,  -0.9852,  -9.9991,  -1.0113,
         -0.9967,  -1.0157,  -1.0048,  -0.9856,  -1.0026,  -0.99

RuntimeError: Size does not match at dimension 0 expected index [200, 1] to be smaller than self [1, 4] apart from dimension 1