In [None]:
import numpy as np
import random
import torch
import torch.nn as nn
import torch.optim as optim
from collections import deque


class PuzzleEnvironment:
    """4x4 sliding-tile puzzle used as a reinforcement-learning environment.

    The board holds the tile ids 1..15 plus ``0`` for the empty cell. An
    action moves the *empty cell* one step: 0 = up, 1 = down, 2 = left,
    3 = right. A move that would leave the board is a no-op (the step is
    still counted).

    Attributes:
        grid_size: ``(rows, cols)`` of the board.
        initial_arr: layout every episode starts from.
        target_arr: layout that ends the episode when reached.
        grid: current layout (a working copy of ``initial_arr``).
        steps: number of ``step`` calls since the last ``reset``.
        empty_position: ``(row, col)`` of the empty cell in ``grid``.
    """

    # (d_row, d_col) applied to the empty cell for each action id.
    # Insertion order (up, down, left, right) is relied on by ``move_cargo``.
    ACTION_DELTAS = {0: (-1, 0), 1: (1, 0), 2: (0, -1), 3: (0, 1)}

    def __init__(self):
        self.grid_size = (4, 4)
        self.initial_arr = np.array([[1, 2, 3, 4],
                                     [5, 6, 7, 8],
                                     [9, 10, 11, 0],  # 0 marks the empty cell
                                     [13, 14, 15, 12]], dtype=int)
        self.target_arr = np.array([[1, 3, 4, 8],
                                    [6, 2, 11, 7],
                                    [5, 9, 0, 12],
                                    [13, 10, 14, 15]], dtype=int)

        self.reset()

    def reset(self):
        """Restore the initial layout, zero the step counter, return the state."""
        self.grid = self.initial_arr.copy()
        self.steps = 0
        # Locate the empty cell; stored as plain ints so it prints and
        # compares cleanly.
        blank_row, blank_col = np.argwhere(self.grid == 0)[0]
        self.empty_position = (int(blank_row), int(blank_col))
        return self.get_state()

    def get_state(self):
        """Return the board as a flat 1-D copy (row-major order)."""
        return self.grid.flatten()

    def is_position_valid(self, pos):
        """Return True when ``pos`` = (row, col) lies inside the board."""
        n_rows, n_cols = self.grid_size
        return (0 <= pos[0] < n_rows) and (0 <= pos[1] < n_cols)

    def swap_with_empty(self, cargo_pos):
        """Exchange the tile at ``cargo_pos`` with the empty cell."""
        blank = self.empty_position
        self.grid[blank], self.grid[cargo_pos] = self.grid[cargo_pos], self.grid[blank]
        self.empty_position = cargo_pos  # the empty cell now sits where the tile was

    def _shifted_blank(self, delta):
        """Return the cell reached by moving the empty cell by ``delta``."""
        row, col = self.empty_position
        return (row + delta[0], col + delta[1])

    def move_cargo(self):
        """Slide one randomly chosen neighbouring tile into the empty cell."""
        valid_moves = [
            pos
            for pos in map(self._shifted_blank, self.ACTION_DELTAS.values())
            if self.is_position_valid(pos)
        ]

        if valid_moves:
            self.swap_with_empty(random.choice(valid_moves))

    def step(self, action):
        """Apply ``action`` and return ``(state, reward, done)``.

        Args:
            action: 0 = up, 1 = down, 2 = left, 3 = right (any value that
                ``int()`` maps onto one of those ids is accepted).

        Returns:
            The flattened board after the move, the reward computed for that
            board, and whether the board equals ``target_arr``.

        Raises:
            ValueError: if ``action`` is not one of the four action ids.
                Previously every unknown value was silently treated as
                "right", which hid caller bugs.
        """
        try:
            delta = self.ACTION_DELTAS[int(action)]
        except (KeyError, TypeError, ValueError):
            raise ValueError(
                f"action must be one of {sorted(self.ACTION_DELTAS)}, got {action!r}"
            ) from None

        new_pos = self._shifted_blank(delta)
        if self.is_position_valid(new_pos):
            self.swap_with_empty(new_pos)

        done = np.array_equal(self.grid, self.target_arr)
        # The reward is computed before the counter is bumped, so the step
        # penalty for the n-th call is (n - 1) * 0.1.
        reward = self.calculate_reward()
        self.steps += 1
        return self.get_state(), reward, done

    @staticmethod
    def _tile_coordinates(layout):
        """Return ``(rows, cols)`` index arrays addressed by tile id.

        Assumes ``layout`` contains every id ``0 .. layout.size - 1`` exactly
        once, which holds for both hard-coded boards and every permutation
        reachable from them.
        """
        # For a permutation of 0..n-1, argsort yields the flat index at
        # which each id is stored.
        flat_index_by_tile = np.argsort(layout, axis=None)
        return np.unravel_index(flat_index_by_tile, layout.shape)

    def calculate_reward(self):
        """Score the current board.

        reward = -(sum of Manhattan distances of tiles 1..15 to their target
        cells) + 10 * (tiles already on their target cell) - 0.1 * steps,
        plus a bonus of 1000 when all 15 tiles are in place.
        """
        max_reward = 1000
        tile_ids = np.arange(1, self.grid.size)  # the empty cell (id 0) is not scored

        rows, cols = self._tile_coordinates(self.grid)
        goal_rows, goal_cols = self._tile_coordinates(self.target_arr)
        distances = (np.abs(rows - goal_rows) + np.abs(cols - goal_cols))[tile_ids]

        total_distance = distances.sum()
        completed_cargos = int(np.count_nonzero(distances == 0))

        reward = -total_distance + completed_cargos * 10 - self.steps * 0.1
        if completed_cargos == tile_ids.size:
            reward += max_reward
        return reward


class DQNModel(nn.Module):
    """Fully connected Q-network: state vector in, one Q-value per action out.

    Two hidden layers of 64 units with ReLU activations feed a linear output
    layer of ``action_size`` units.
    """

    def __init__(self, state_size, action_size):
        super().__init__()
        hidden_units = 64
        self.fc1 = nn.Linear(state_size, hidden_units)
        self.fc2 = nn.Linear(hidden_units, hidden_units)
        self.fc3 = nn.Linear(hidden_units, action_size)

    def forward(self, x):
        """Return the Q-values for the batch of states ``x``."""
        for hidden_layer in (self.fc1, self.fc2):
            x = nn.functional.relu(hidden_layer(x))
        # No activation on the output: Q-values are unbounded.
        return self.fc3(x)


class DQNAgent:
    """Epsilon-greedy DQN agent with a uniform replay buffer.

    Attributes:
        memory: bounded buffer of ``(state, action, reward, next_state, done)``.
        gamma: discount factor applied to the bootstrapped next-state value.
        alpha: learning rate handed to the Adam optimizer.
        epsilon: current exploration probability; multiplied by
            ``epsilon_decay`` after every training call and never decayed
            below ``epsilon_min``.
        batch_size: number of transitions sampled per ``replay`` call.
    """

    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=2000)

        self.gamma = 0.9
        self.alpha = 0.001
        self.epsilon = 1.0
        self.epsilon_min = 0.1
        self.epsilon_decay = 0.9995
        self.batch_size = 32
        self.model = DQNModel(state_size, action_size)
        self.optimizer = optim.Adam(self.model.parameters(), lr=self.alpha)
        self.loss_fn = nn.MSELoss()

    @staticmethod
    def _as_batch(state):
        """Convert one state into a float32 tensor with a leading batch axis."""
        return torch.as_tensor(state, dtype=torch.float32).unsqueeze(0)

    def remember(self, state, action, reward, next_state, done):
        """Append one transition to the replay buffer."""
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state, test=False):
        """Choose an action for ``state``.

        Args:
            state: observation accepted by the model (converted to float32).
            test: when True, exploration is disabled and the greedy action
                is always returned.

        Returns:
            The action index as a plain ``int`` (both for the random and the
            greedy branch).
        """
        if not test and np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)

        # Pure inference: no autograd graph is needed.
        with torch.no_grad():
            q_values = self.model(self._as_batch(state)).numpy()[0]
        return int(np.argmax(q_values))

    def replay(self):
        """Train on one random minibatch, then decay ``epsilon``.

        Does nothing until the buffer holds at least ``batch_size``
        transitions. Each sampled transition gets its own optimizer step.
        The regression target equals the current prediction everywhere
        except at the taken action, where it is ``reward`` (terminal) or
        ``reward + gamma * max_a Q(next_state, a)``.
        """
        if len(self.memory) < self.batch_size:
            return

        minibatch = random.sample(self.memory, self.batch_size)
        for state, action, reward, next_state, done in minibatch:
            target = reward
            if not done:
                # Bootstrapped value must not receive gradients.
                with torch.no_grad():
                    next_q = self.model(self._as_batch(next_state))
                target = reward + self.gamma * next_q.max().item()

            # One forward pass serves both as prediction and as the base of
            # the target vector (the parameters do not change in between).
            output = self.model(self._as_batch(state))
            target_f = output.detach().clone()
            target_f[0][action] = target  # only the taken action is corrected

            loss = self.loss_fn(output, target_f)
            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()

        if self.epsilon > self.epsilon_min:
            # Clamp so the last decay step cannot undershoot the floor.
            self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)


# ---------------------------------------------------------------------------
# Setup: one environment, one agent sized to the flattened board.
# ---------------------------------------------------------------------------
env = PuzzleEnvironment()
state_size = env.grid_size[0] * env.grid_size[1]
action_size = 4
agent = DQNAgent(state_size, action_size)

n_episodes = 1000
max_steps_per_episode = 30

movement_paths = []           # board snapshots of every episode
best_path = None              # snapshots of the best *solved* episode
best_reward = float('-inf')

# ---------------------------------------------------------------------------
# Training: epsilon-greedy rollouts, one replay update after every step.
# ---------------------------------------------------------------------------
for episode in range(n_episodes):
    state = env.reset()
    total_reward = 0
    done = False
    episode_path = []

    for step in range(max_steps_per_episode):
        action = agent.act(state)
        next_state, reward, done = env.step(action)
        total_reward += reward

        # Snapshot the board after the move.
        episode_path.append(env.grid.copy())

        agent.remember(state, action, reward, next_state, done)
        state = next_state
        agent.replay()

        if done:
            break

    movement_paths.append(episode_path)

    # Only an episode that actually reached the target can become the best path.
    if done and total_reward > best_reward:
        best_reward = total_reward
        best_path = list(episode_path)

    print(f"Episode {episode + 1}/{n_episodes}, Total Reward: {total_reward}, Epsilon: {agent.epsilon}, Done: {done}")

# ---------------------------------------------------------------------------
# Evaluation: a single greedy rollout (no exploration) with the same step cap.
# ---------------------------------------------------------------------------
print("\nTesting the environment:")
state = env.reset()
done = False
test_steps = 0
test_path = []
test_reward = 0

# The loop variable doubles as the count of executed steps.
for test_steps in range(1, max_steps_per_episode + 1):
    action = agent.act(state, test=True)
    next_state, reward, done = env.step(action)
    test_reward += reward

    test_path.append(env.grid.copy())
    state = next_state

    if done:
        break

print(f"\nTesting finished after {test_steps} steps.")
print(f"Total Test Reward: {test_reward}, Best Reward during training: {best_reward}")

# best_reward stays at -inf when no training episode was solved, so this
# branch is only entered when best_path holds a real path.
if test_reward < best_reward:
    print("Test result is worse than the best training result. Best path:")
    for step_num, step in enumerate(best_path, start=1):
        print(f"Step {step_num}:\n{step}")
######################################################### end of the 4x4 puzzle with one empty cell