# PPO Implementation for the 15 Puzzle Game

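This notebook sets up the building blocks for a Proximal Policy Optimization (PPO) agent that learns to solve the 15 Puzzle Game: a puzzle environment and an actor-critic network. The PPO training loop itself is left to be completed in the final section.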

In [None]:
!pip install gym torch numpy

In [None]:
import gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions import Categorical
import random
from collections import deque


## Define the 15 Puzzle Environment

In [None]:
class FifteenPuzzleEnv:
    """Sliding-tile 15 puzzle on a ``size`` x ``size`` board (``size`` is 4).

    The board is stored row-major in a flat integer array ``self.state``
    where ``0`` marks the blank square.  The puzzle is solved when the array
    equals ``0, 1, ..., size*size - 1``, i.e. the blank sits in the top-left
    corner followed by the tiles in ascending order.

    Actions are the strings ``'up'``, ``'down'``, ``'left'`` and ``'right'``
    and describe the direction in which the *blank* moves.
    """

    # Row/column offset applied to the blank for each action.  The insertion
    # order also fixes the order returned by get_possible_actions().
    _MOVES = {
        'up': (-1, 0),
        'down': (1, 0),
        'left': (0, -1),
        'right': (0, 1),
    }

    def __init__(self):
        self.size = 4
        self.reset()

    def reset(self):
        """Start a new episode from a random *solvable* board.

        Returns:
            A copy of the freshly shuffled flat board.
        """
        self.state = np.arange(self.size * self.size)
        np.random.shuffle(self.state)
        if not self._is_solvable():
            # A uniform shuffle lands on an unreachable configuration half of
            # the time.  Exchanging two tiles (not the blank) flips the
            # permutation parity without moving the blank, which maps every
            # unsolvable board to a solvable one and keeps the draw uniform.
            tiles = np.where(self.state != 0)[0][:2]
            if len(tiles) == 2:
                self.state[tiles] = self.state[tiles[::-1]]
        return self.get_state()

    def _is_solvable(self):
        """Return True if the current board can reach the solved board.

        Every move swaps the blank with a neighbour, so it flips the parity
        of the permutation and changes the blank's row + column by one.  The
        solved board has zero inversions and the blank at (0, 0); a board is
        therefore reachable exactly when both parities agree.
        """
        board = self.state
        inversions = sum(
            int((board[i + 1:] < board[i]).sum()) for i in range(len(board))
        )
        blank_row, blank_col = divmod(
            int(np.where(board == 0)[0][0]), self.size
        )
        return inversions % 2 == (blank_row + blank_col) % 2

    def get_state(self):
        """Return a copy of the flat board so callers cannot mutate it."""
        return self.state.copy()

    def is_done(self):
        """Return True when the board equals ``0, 1, ..., size*size - 1``."""
        return np.array_equal(self.state, np.arange(self.size * self.size))

    def get_possible_actions(self):
        """Return the actions that keep the blank on the board.

        Returns:
            A list of action names, in the order up, down, left, right.
        """
        empty_pos = np.where(self.state == 0)[0][0]
        row, col = divmod(empty_pos, self.size)
        return [
            name
            for name, (d_row, d_col) in self._MOVES.items()
            if 0 <= row + d_row < self.size and 0 <= col + d_col < self.size
        ]

    def step(self, action):
        """Move the blank one square in the given direction.

        An unknown action, or a move that would push the blank off the
        board, leaves the board unchanged; the step is still charged the
        usual reward.  (Without the bounds check a horizontal move at a row
        edge would wrap into the neighbouring row and a vertical move at the
        top/bottom edge would index outside the intended square.)

        Args:
            action: One of ``'up'``, ``'down'``, ``'left'``, ``'right'``.

        Returns:
            A tuple ``(state, reward, done)`` where ``state`` is a copy of
            the board after the move, ``reward`` is ``1`` if the puzzle is
            solved and ``-0.1`` otherwise, and ``done`` tells whether the
            puzzle is solved.
        """
        offset = self._MOVES.get(action)
        if offset is not None:
            empty_pos = np.where(self.state == 0)[0][0]
            row, col = divmod(empty_pos, self.size)
            new_row, new_col = row + offset[0], col + offset[1]
            if 0 <= new_row < self.size and 0 <= new_col < self.size:
                new_pos = new_row * self.size + new_col
                self.state[empty_pos], self.state[new_pos] = (
                    self.state[new_pos],
                    self.state[empty_pos],
                )
        done = self.is_done()
        reward = 1 if done else -0.1
        return self.get_state(), reward, done


## Define the PPO Agent

In [None]:
class ActorCritic(nn.Module):
    """Two-headed network sharing one hidden layer.

    A single ``Linear -> ReLU`` trunk feeds two linear heads: a policy head
    producing one raw score per action and a value head producing a single
    scalar.  No softmax is applied here; the policy output is unnormalised.

    Args:
        input_dim: Size of the last dimension of the input tensor.
        action_dim: Number of outputs of the policy head.
        hidden_dim: Width of the shared hidden layer (defaults to 128).
    """

    def __init__(self, input_dim, action_dim, hidden_dim=128):
        super().__init__()
        # Layers are created in trunk, policy, value order so that seeded
        # parameter initialisation is reproducible.
        self.fc = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
        )
        self.policy_head = nn.Linear(hidden_dim, action_dim)
        self.value_head = nn.Linear(hidden_dim, 1)

    def forward(self, x):
        """Run the shared trunk once and evaluate both heads on it.

        Args:
            x: Floating-point tensor whose last dimension is ``input_dim``.

        Returns:
            A tuple ``(policy_out, value_out)`` whose last dimensions are
            ``action_dim`` and ``1`` respectively.
        """
        features = self.fc(x)
        return self.policy_head(features), self.value_head(features)


## Training Loop (To Be Completed)

In [None]:
# You would implement the PPO training loop here,
# including rollout collection, advantage estimation,
# and policy/value network updates.