Write a PyTorch implementation of a Variational Autoencoder (VAE) for the MNIST dataset with a 2D latent space. Please ensure that the following requirements are satisfied:

- Scale and normalize the dataset.
- Split the training dataset into training and validation sets.
- Use a convolutional neural network (CNN) for both the encoder and the decoder.
- Use a Gaussian distribution for q(z|x) in the encoder.
- Use a Gaussian distribution for p(x|z) in the decoder.
- Define the loss function as a separate method, adjusted for the distribution used for the decoder.
- Use a GPU (CUDA) or MPS if available.


```python
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torchvision import datasets, transforms
from torch.utils.data import DataLoader, random_split

# Select the compute device: prefer CUDA, then Apple's MPS backend, then CPU.
# torch.backends.mps is only present in PyTorch 1.12 and later, so probe for
# it with getattr instead of assuming the attribute exists; on older builds
# the script then falls back to the CPU instead of raising AttributeError.
_mps_backend = getattr(torch.backends, "mps", None)
if torch.cuda.is_available():
    device = torch.device("cuda")
elif _mps_backend is not None and _mps_backend.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

# Define transforms: ToTensor scales pixel values to [0, 1]; Normalize then
# standardizes them with the mean and std of the MNIST dataset.
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.1307,), (0.3081,)),
])

# Load the MNIST dataset (downloaded into ./data when not already present)
train_dataset_full = datasets.MNIST(root='./data', train=True, download=True, transform=transform)
test_dataset = datasets.MNIST(root='./data', train=False, download=True, transform=transform)

# Split training data into training (90%) and validation (10%) sets.
# A seeded generator keeps the split identical across runs, so validation
# losses from different runs are measured on the same held-out images.
train_size = int(0.9 * len(train_dataset_full))
val_size = len(train_dataset_full) - train_size
train_dataset, val_dataset = random_split(
    train_dataset_full,
    [train_size, val_size],
    generator=torch.Generator().manual_seed(42),
)

# Create DataLoaders; only the training loader reshuffles every epoch.
batch_size = 128
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Convolutional encoder producing the parameters of q(z|x)
class Encoder(nn.Module):
    """Map an image batch to the mean and log-variance of a Gaussian q(z|x).

    Args:
        latent_dim: Number of latent dimensions (size of each output vector).

    The shape comments below assume 1 x 28 x 28 inputs (MNIST digits).
    """

    def __init__(self, latent_dim=2):
        super().__init__()
        # Two stride-2 convolutions halve the resolution twice; the final
        # 7x7 convolution collapses the 7x7 map into a single 128-d vector.
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=32,
                               kernel_size=4, stride=2, padding=1)   # 32 x 14 x 14
        self.conv2 = nn.Conv2d(in_channels=32, out_channels=64,
                               kernel_size=4, stride=2, padding=1)   # 64 x 7 x 7
        self.conv3 = nn.Conv2d(in_channels=64, out_channels=128,
                               kernel_size=7)                        # 128 x 1 x 1

        # Separate linear heads for the mean and the log-variance
        self.fc_mu = nn.Linear(in_features=128, out_features=latent_dim)
        self.fc_logvar = nn.Linear(in_features=128, out_features=latent_dim)

    def forward(self, x):
        """Return ``(mu, log_var)`` of q(z|x), each of shape batch x latent_dim."""
        features = x
        for conv in (self.conv1, self.conv2, self.conv3):
            features = F.relu(conv(features))
        # Flatten everything except the batch dimension: batch x 128
        flat = features.view(features.size(0), -1)
        return self.fc_mu(flat), self.fc_logvar(flat)

# Transposed-convolution decoder producing the mean of p(x|z)
class Decoder(nn.Module):
    """Map latent vectors to the mean image of a Gaussian p(x|z).

    Args:
        latent_dim: Number of latent dimensions expected in the input.

    The network mirrors the encoder: a linear layer lifts z to 128 features,
    then three transposed convolutions upsample 1x1 -> 7x7 -> 14x14 -> 28x28.
    """

    def __init__(self, latent_dim=2):
        super().__init__()
        # Linear lift from the latent space to a 128 x 1 x 1 feature map
        self.fc = nn.Linear(in_features=latent_dim, out_features=128 * 1 * 1)
        # Upsampling stack
        self.deconv1 = nn.ConvTranspose2d(in_channels=128, out_channels=64,
                                          kernel_size=7)                       # 64 x 7 x 7
        self.deconv2 = nn.ConvTranspose2d(in_channels=64, out_channels=32,
                                          kernel_size=4, stride=2, padding=1)  # 32 x 14 x 14
        self.deconv3 = nn.ConvTranspose2d(in_channels=32, out_channels=1,
                                          kernel_size=4, stride=2, padding=1)  # 1 x 28 x 28

    def forward(self, z):
        """Return the reconstruction mean of shape batch x 1 x 28 x 28."""
        hidden = self.fc(z).view(-1, 128, 1, 1)
        hidden = F.relu(self.deconv1(hidden))
        hidden = F.relu(self.deconv2(hidden))
        # No activation on the last layer: the output is an unbounded mean,
        # matching inputs that were standardized rather than kept in [0, 1].
        return self.deconv3(hidden)

# Variational Autoencoder (VAE): encoder + reparameterized sampling + decoder
class VAE(nn.Module):
    """Variational autoencoder combining ``Encoder`` and ``Decoder``.

    Args:
        latent_dim: Number of latent dimensions shared by both sub-networks.
    """

    def __init__(self, latent_dim=2):
        super().__init__()
        self.encoder = Encoder(latent_dim)
        self.decoder = Decoder(latent_dim)

    def reparameterize(self, mu, log_var):
        """Draw z ~ N(mu, exp(log_var)) as a differentiable function of both.

        The noise is sampled independently of the parameters, so gradients
        flow through ``mu`` and ``log_var`` (the reparameterization trick).
        """
        std = (0.5 * log_var).exp()
        noise = torch.randn_like(std)
        return mu + noise * std

    def forward(self, x):
        """Return ``(x_recon, mu, log_var)`` for the input batch ``x``."""
        mu, log_var = self.encoder(x)
        sample = self.reparameterize(mu, log_var)
        return self.decoder(sample), mu, log_var

# Define the loss function as a separate method
def loss_function(x_recon, x, mu, log_var, sigma=1.0):
    """Return the negative ELBO of a Gaussian-decoder VAE, summed over the batch.

    The decoder models p(x|z) = N(x; x_recon, sigma^2 I) with a fixed standard
    deviation, so its negative log-likelihood is
    ``sum((x - x_recon)^2) / (2 * sigma^2)`` plus a term that does not depend
    on the model parameters; that constant term is omitted.  The previous
    version used the bare summed squared error, which silently dropped the
    1/2 factor and therefore weighted reconstruction twice as heavily against
    the KL term as a unit-variance Gaussian implies.

    Args:
        x_recon: Decoder output, i.e. the mean of p(x|z); same shape as ``x``.
        x: Target batch.
        mu: Mean of q(z|x).
        log_var: Log-variance of q(z|x).
        sigma: Fixed standard deviation of the decoder Gaussian (default 1.0).

    Returns:
        Scalar tensor: reconstruction term plus KL(q(z|x) || N(0, I)).

    Raises:
        ValueError: If ``sigma`` is not strictly positive.
    """
    if sigma <= 0:
        raise ValueError("sigma must be strictly positive")
    # Reconstruction loss (Gaussian negative log-likelihood, up to a constant)
    squared_error = F.mse_loss(x_recon, x, reduction='sum')
    recon_loss = 0.5 * squared_error / (sigma ** 2)
    # Closed-form KL divergence between N(mu, exp(log_var)) and N(0, I)
    kl_loss = -0.5 * torch.sum(1 + log_var - mu.pow(2) - log_var.exp())
    # Total loss
    total_loss = recon_loss + kl_loss
    return total_loss

# Build a VAE with a two-dimensional latent space, move it to the selected
# device, and attach an Adam optimizer to all of its parameters.
latent_dim = 2
model = VAE(latent_dim=latent_dim)
model = model.to(device)
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)

# Training function
def train(epoch):
    """Run one optimization pass over ``train_loader`` and print progress.

    Uses the module-level ``model``, ``optimizer``, ``device`` and
    ``loss_function``.  The loss of every 100th batch is printed per sample,
    followed by the per-sample average over the whole pass.

    Args:
        epoch: Epoch number, used only in the printed messages.
    """
    model.train()
    running_total = 0
    for step, (images, _) in enumerate(train_loader):
        images = images.to(device)

        optimizer.zero_grad()
        recon, mu, log_var = model(images)
        batch_loss = loss_function(recon, images, mu, log_var)
        batch_loss.backward()
        optimizer.step()

        # The loss is a sum over the batch, so accumulate it as-is and
        # divide by the number of samples only when reporting.
        batch_loss_value = batch_loss.item()
        running_total += batch_loss_value

        if step % 100 == 0:
            progress_pct = 100. * step / len(train_loader)
            print(f"Train Epoch: {epoch} [{step * len(images)}/{len(train_loader.dataset)} "
                  f"({progress_pct:.0f}%)]\tLoss: {batch_loss_value / len(images):.6f}")

    average_loss = running_total / len(train_loader.dataset)
    print(f"====> Epoch: {epoch} Average loss: {average_loss:.4f}")

# Validation function
def validate(epoch):
    """Evaluate the model on ``val_loader`` and print the per-sample loss.

    Runs in eval mode with gradient tracking disabled, using the module-level
    ``model``, ``device`` and ``loss_function``.

    Args:
        epoch: Accepted for symmetry with ``train``; not used in the body.
    """
    model.eval()
    batch_losses = []
    with torch.no_grad():
        for images, _ in val_loader:
            images = images.to(device)
            recon, mu, log_var = model(images)
            batch_losses.append(loss_function(recon, images, mu, log_var).item())
    # Each batch loss is a sum over its samples, so normalize by dataset size.
    val_loss = sum(batch_losses) / len(val_loader.dataset)
    print(f"====> Validation loss: {val_loss:.4f}")

# Run the requested number of epochs (numbered from 1); each epoch performs
# one training pass followed by one validation pass.
num_epochs = 10
for epoch in range(1, num_epochs + 1):
    for run_phase in (train, validate):
        run_phase(epoch)
```

This code implements a Variational Autoencoder (VAE) with a 2D latent space using PyTorch, satisfying the specified requirements:

- **Scaling and Normalizing the Dataset**: `transforms.ToTensor()` scales the MNIST pixel values to the range [0, 1], and `transforms.Normalize((0.1307,), (0.3081,))` then standardizes them using the mean and standard deviation of the MNIST dataset.
- **Division into Training and Validation Sets**: The training dataset is split into training and validation sets using `random_split`.
- **Convolutional Neural Networks for Encoder and Decoder**: Both the encoder and decoder are defined using convolutional layers (`nn.Conv2d` and `nn.ConvTranspose2d`).
- **Gaussian Distributions in Encoder and Decoder**: The encoder outputs the parameters (`mu` and `log_var`) for the Gaussian distribution `q(z|x)`, and the decoder outputs the mean of the Gaussian distribution `p(x|z)`.
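- **Separate Loss Function**: The loss function is defined as a separate method (`loss_function`). Its reconstruction term is a summed squared error, which equals the negative log-likelihood of a fixed-variance Gaussian decoder up to a scale factor and an additive constant; the second term is the closed-form KL divergence between `q(z|x)` and the standard normal prior.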
- **Use of GPU or MPS if Available**: The code checks for available computation devices and uses them accordingly.

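**Note**: Ensure that you have a recent version of PyTorch and torchvision installed (the MPS backend requires PyTorch 1.12 or later) and that your environment supports CUDA or MPS if you intend to use GPU acceleration; otherwise the code falls back to the CPU.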