# Simulating N samples simultaneously

Since we are only interested in ground level, we only need to store ground level data (Saves memory).

This means that our simulation can be reduce to just a 2D computation.

In [1]:
def run_simulation_2D(x, y, nx, ny, Lx, Ly, cx, cy, sx, sy):
    dx, dy = Lx/(nx-1), Ly/(ny-1)
    dt = 1
    tend = 1200
    t = 0

    cfl_x, cfl_y = cx * dt/dx, cy * dt/dy
    diff_x, diff_y = sx * dt/dx**2, sy * dt/dy**2

    u = np.zeros((nx+2, ny+2))
    sol = []
    source_x, source_y = nx // 2, ny // 2
    Q = 1e-6
    
    while t < tend:
        unew = u.copy()
        sol.append(u[1:-1, 1:-1])

         # Advection (Upwind Scheme)
        unew[1:-1, 1:-1] -= cfl_x * (u[1:-1, 1:-1] - u[1:-1, :-2])
        unew[1:-1, 1:-1] -= cfl_y * (u[1:-1, 1:-1] - u[:-2, 1:-1])
    
        # Diffusion (Central Differencing)
        unew[1:-1, 1:-1] += diff_x * (u[1:-1, 2:] - 2*u[1:-1, 1:-1] + u[1:-1, :-2])
        unew[1:-1, 1:-1] += diff_y * (u[2:, 1:-1] - 2*u[1:-1, 1:-1] + u[:-2, 1:-1])

        # Source Term
        unew[source_x, source_y] += Q * dt

        # Additional Source Points (forming a small area)
        offsets = [(-1, -1), (-1, 1), (1, -1), (1, 1), (-1, 0), (1, 0), (0, -1), (0, 1)]
        for dx, dy in offsets:
            unew[source_x + dx, source_y + dy] += Q * dt

        u = unew
        t += dt
        
    return np.array(sol)

In [2]:
from joblib import Parallel, delayed
import numpy as np
import time as time

nx, ny= 51, 51  # Grid points
Lx, Ly = 5000, 5000  # Domain size in meters
x = np.linspace(-2500, 2500, nx)  # Centered at (0,0)
y = np.linspace(-2500, 2500, ny)
n = 1000
cx, cy = np.random.RandomState().uniform(0, 10, n), np.random.RandomState().uniform(0, 10, n)
sx, sy = np.random.RandomState().uniform(0, 1, n), np.random.RandomState().uniform(0, 1, n)
num_cores = -1

start_time = time.time()
results = Parallel(n_jobs=num_cores)(
    delayed(run_simulation_2D)(x, y, nx, ny, Lx, Ly, cx[i], cy[i], sx[i], sy[i])
    for i in range(n)
)
end_time = time.time()

print(f"Simulation took: {end_time-start_time}")

Simulation took: 36.63258719444275


In [3]:
observed = np.load("test.npy")
observed.shape, results[0].shape

((51, 51, 1200), (1200, 51, 51))

We need to fix the shapes so that they correspond to each other.

Currently the simulated results is represented in an 3-D array, where each array within a timestep, and the respective concentration in the x-y grid. The observed is also in a 3-D array, however in shape (51, 51, 1200). This should mean that for each x-grid, it shows the value of y over time 1200.

Using `np.reshape` work in matching the dimensions. However, as the time and spatial dimensions are handled differently, it may not yield the same definition. Specifically, each slice in the observed in a seperate calculation per time step (instantaneous snapshots at t_i), whereas the simulation tracks concentration evolving over time. Time for simulated is the first axis, whereas it is last axis for observed.

In [4]:
results[0].reshape((51, 51, 1200)).shape, observed.shape

((51, 51, 1200), (51, 51, 1200))

An alternative here is to use `np.tranpose()`. This gives us more clarity into how we transform the simulated data.

Structure for simulated solution (axis 0: Time, axis 1:, Nx, axis 2: Ny) and analytical solution (axis 0: Nx, axis 1: Ny, axis 2: Time).

By using using `np.transpose(results[0], (1, 2, 0))`, the axes are rearranged to be:

- Simulated solution (axis 0: Nx, axis 1:, Ny, axis 2: Time)

Which is what we wanted.

In [5]:
np.transpose(results[0], (1, 2, 0)).shape

(51, 51, 1200)

The updated function now looks like this:

In [1]:
def run_simulation_2D(x, y, nx, ny, Lx, Ly, cx, cy, sx, sy):
    dx, dy = Lx/(nx-1), Ly/(ny-1)
    dt = 1
    tend = 1200
    t = 0

    cfl_x, cfl_y = cx * dt/dx, cy * dt/dy
    diff_x, diff_y = sx * dt/dx**2, sy * dt/dy**2

    u = np.zeros((nx+2, ny+2))
    sol = []
    source_x, source_y = nx // 2, ny // 2
    Q = 1e-6
    
    while t < tend:
        unew = u.copy()
        sol.append(u[1:-1, 1:-1])

         # Advection (Upwind Scheme)
        unew[1:-1, 1:-1] -= cfl_x * (u[1:-1, 1:-1] - u[1:-1, :-2])
        unew[1:-1, 1:-1] -= cfl_y * (u[1:-1, 1:-1] - u[:-2, 1:-1])
    
        # Diffusion (Central Differencing)
        unew[1:-1, 1:-1] += diff_x * (u[1:-1, 2:] - 2*u[1:-1, 1:-1] + u[1:-1, :-2])
        unew[1:-1, 1:-1] += diff_y * (u[2:, 1:-1] - 2*u[1:-1, 1:-1] + u[:-2, 1:-1])

        # Source Term
        unew[source_x, source_y] += Q * dt

        # Additional Source Points (forming a small area)
        offsets = [(-1, -1), (-1, 1), (1, -1), (1, 1), (-1, 0), (1, 0), (0, -1), (0, 1)]
        for dx, dy in offsets:
            unew[source_x + dx, source_y + dy] += Q * dt

        u = unew
        t += dt

    sol = np.transpose(sol, (1, 2, 0))
    return np.array(sol)

In [2]:
from joblib import Parallel, delayed
import numpy as np
import time as time

nx, ny= 51, 51  # Grid points
Lx, Ly = 5000, 5000  # Domain size in meters
x = np.linspace(-2500, 2500, nx)  # Centered at (0,0)
y = np.linspace(-2500, 2500, ny)
n = 50
cx, cy = np.random.RandomState().uniform(0, 10, n), np.random.RandomState().uniform(0, 10, n)
sx, sy = np.random.RandomState().uniform(0, 1, n), np.random.RandomState().uniform(0, 1, n)
num_cores = -1

start_time = time.time()
results = Parallel(n_jobs=num_cores)(
    delayed(run_simulation_2D)(x, y, nx, ny, Lx, Ly, cx[i], cy[i], sx[i], sy[i])
    for i in range(n)
)
end_time = time.time()

print(f"Simulation took: {end_time-start_time}")

Simulation took: 2.4160685539245605


## Applying Distance Metrics

We try to implement the same way as we have for the 1D problem, and adjust if there are any issues.

Because the shape of the results would be in 4D (n, Nx, Ny, time), it would be infeasible to try and solve everything all at once.

However, parallelisation can still be utilised.

In [3]:
import numpy as np 
from scipy.spatial.distance import pdist, squareform, cdist

In [4]:
observed = np.load("test.npy")

In [5]:
len(results)

50

The distances should be modified such that it computes the distance **for each spatial location** across time first (outputting a 51x51 matrix), with each (i, j) representating the distance at that point, and then output an average (?).

The original distance metrics were desgined so that it computes the distance between each column.
- This is because each column represented one solution.

### Wasserstein

#### Original

In [6]:
def wasserstein_distance(simulated_sample: np.ndarray, observed_sample: np.ndarray) -> float:
    # Mean Difference between simulated and observed
    simulated_sorted = np.sort(simulated_sample, axis=0)
    observed_sorted = np.sort(observed_sample, axis=0)
    distance = np.mean(np.abs(simulated_sorted - observed_sorted), axis=0)

    return distance

#### Modified

In [7]:
def wasserstein_distance_3D(simulated_sample: np.ndarray, observed_sample: np.ndarray) -> np.ndarray:
    """
    Compute the Wasserstein distance between two (51, 51, 1200) shaped arrays 
    along the time dimension.

    Returns a (51, 51) array of distances for each spatial location.
    """
    # Sort along the time axis (axis=2)
    simulated_sorted = np.sort(simulated_sample, axis=2)
    observed_sorted = np.sort(observed_sample, axis=2)

    # Compute the mean absolute difference along the time axis
    distance = np.mean(np.abs(simulated_sorted - observed_sorted), axis=2)

    return distance 

In [8]:
wass = Parallel(n_jobs=num_cores)(
    delayed(wasserstein_distance_3D)(results[i], observed)
    for i in range(n)
)

### Energy

I expect that energy, MMD and KLD will take a lot longer because of the nature of the $O(n^2)$ nature of the metrics. Where the need to calculate pairwise distances between 2D matricies will be computationally difficult.

#### Original

In [6]:
def energy_dist(simulated_sample: np.ndarray, observed_sample: np.ndarray) -> float:
    ncol = simulated_sample.shape[1]

    mean_dist_XY = np.empty(ncol)  # Mean distances between columns
    mean_dist_XX = np.empty(ncol)  # Mean distances within array1
    mean_dist_YY = np.empty(ncol)  # Mean distances within array2

    for i in range(ncol):
        mean_dist_XX[i] = np.mean(squareform(pdist(simulated_sample[:, i, np.newaxis], metric='euclidean')))
        mean_dist_YY[i] = np.mean(squareform(pdist(observed_sample[:, i, np.newaxis], metric='euclidean')))
        mean_dist_XY[i] = np.mean(cdist(simulated_sample[:, i, np.newaxis], observed_sample[:, i, np.newaxis], metric='euclidean'))

    # Calculate the energy distances for each column in a vectorized way
    energy_distances = 2 * mean_dist_XY - mean_dist_XX - mean_dist_YY

    return energy_distances

#### Modified

To apply energy distance for our problem, we have to calculate energy distance at each [i, j] component of the matrix. 

Each [i, j] component has t=1200 values. i.e Each grid is a curve of its own. So in total we have 51x51=2601 curves.

So we need to calculate the energy distance 2601 times, and then return an average of energy distance?
- May be concerns because of domain size, maybe we only care about the middle bit, or the max energy distance, etc.

In [33]:
Nx, Ny, t = results[0].shape
obs_pairwise = []

for i in range(Nx):
    for j in range(Ny):
        obs_ij = observed[i, j, :].reshape(-1,1) # We now have an array of size 1200, then reshaped to be (1200, 1)
        obs_pairwise.append(np.mean(pdist(obs_ij)))

In [37]:
sum(np.array(obs_pairwise) > 0)

0

In [40]:
observed[0].shape

(51, 1200)

In [17]:
def energy_dist_3D(simulated_sample: np.ndarray, observed_sample: np.ndarray) -> np.ndarray:
    Nx, Ny, T = simulated_sample.shape  # (51, 51, 1200)
    energy_distances = np.empty((Nx, Ny))  # Store distance for each (x, y)

    for i in range(Nx):
        for j in range(Ny):
            # Extract time series at (i, j) location
            sim_series = simulated_sample[i, j, :].reshape(-1, 1)  # (1200, 1)
            obs_series = observed_sample[i, j, :].reshape(-1, 1)  # (1200, 1)

            # Compute mean pairwise distances
            mean_dist_XX = np.mean(squareform(pdist(sim_series, metric='euclidean')))
            mean_dist_YY = np.mean(squareform(pdist(obs_series, metric='euclidean')))
            mean_dist_XY = np.mean(cdist(sim_series, obs_series, metric='euclidean'))

            # Compute energy distance
            energy_distances[i, j] = 2 * mean_dist_XY - mean_dist_XX - mean_dist_YY

    return energy_distances

In [16]:
energy_dist_3D(results[0], observed)

array([[1.96649462e-12, 6.07211140e-12, 1.79061042e-11, ...,
        1.79061042e-11, 6.07211140e-12, 1.96649462e-12],
       [7.68599312e-13, 2.57921208e-12, 8.23784132e-12, ...,
        8.23784132e-12, 2.57921208e-12, 7.68599312e-13],
       [2.65328094e-13, 9.77496878e-13, 3.41453778e-12, ...,
        3.41453778e-12, 9.77496878e-13, 2.65328094e-13],
       ...,
       [5.78720710e-92, 7.17956526e-89, 8.65213478e-86, ...,
        1.20644628e-15, 1.78437511e-15, 2.57310604e-15],
       [1.12597471e-92, 1.37530226e-89, 1.63048833e-86, ...,
        2.39120304e-16, 3.58553114e-16, 5.23815539e-16],
       [2.13481262e-93, 2.56917679e-90, 2.99874718e-87, ...,
        4.61475461e-17, 7.01032529e-17, 1.03685704e-16]])

#### Optimising the distance

Key points to optimise:
- **Avoid the double loop**: Because the output is the average (?), it will work as long as the indicies between the two input arrays match.
- **Pairwise distances**: Is it possible to not calculate the pairwise distance between observed and itself (YY) everytime?

In [13]:
observed[0, 0,:].shape
# squareform(pdist(observed), axis=1)

(1200,)

In [None]:
def energy_dist_3D(simulated_sample: np.ndarray, observed_sample: np.ndarray) -> np.ndarray:
    # Nx, Ny, T = simulated_sample.shape  # (51, 51, 1200)

    # # Reshape to (Nx*Ny, T) to handle all spatial locations at once
    # sim_reshaped = simulated_sample.reshape(Nx * Ny, T)
    # obs_reshaped = observed_sample.reshape(Nx * Ny, T)

    # # Compute pairwise distances *only across the time axis* (T)
    # for i in range(Nx * Ny):
    #     mean_dist_XX = np.mean(squareform(pdist(sim_reshaped[i, :, None])), axis=1)
    #     mean_dist_YY = np.mean(squareform(pdist(obs_reshaped[i, :, None])), axis=1)
    #     mean_dist_XY = np.mean(cdist(sim_reshaped[i, :, None], obs_reshaped[i, :, None]), axis=1)

    #     # energy_distance
                                        
    # mean_dist_XY = np.mean([cdist(sim_reshaped[i, :, None], obs_reshaped[i, :, None], metric='euclidean') for i in range(Nx * Ny)], axis=1)

    # # Compute energy distance
    # energy_distances = 2 * mean_dist_XY - mean_dist_XX - mean_dist_YY

    # Reshape back to (51, 51)
    return energy_distances.reshape(Nx, Ny)

In [40]:
# def energy_dist_3D(simulated_sample: np.ndarray, observed_sample: np.ndarray) -> np.ndarray:
#     Nx, Ny, T = simulated_sample.shape  # (51, 51, 1200)

#     # Reshape to (Nx*Ny, T) to handle all spatial locations at once
#     sim_reshaped = simulated_sample.reshape(Nx * Ny, T)
#     obs_reshaped = observed_sample.reshape(Nx * Ny, T)

#     # Compute pairwise distances *only across the time axis* (T)
#     for i in range(Nx * Ny):
#         mean_dist_XX = np.mean(squareform(pdist(sim_reshaped[i, :, None])), axis=1)
#         mean_dist_YY = np.mean(squareform(pdist(obs_reshaped[i, :, None])), axis=1)
#         mean_dist_XY = np.mean(cdist(sim_reshaped[i, :, None], obs_reshaped[i, :, None]), axis=1)

#         # energy_distance
                                        
#     mean_dist_XY = np.mean([cdist(sim_reshaped[i, :, None], obs_reshaped[i, :, None], metric='euclidean') for i in range(Nx * Ny)], axis=1)

#     # Compute energy distance
#     energy_distances = 2 * mean_dist_XY - mean_dist_XX - mean_dist_YY

#     # Reshape back to (51, 51)
#     return energy_distances.reshape(Nx, Ny)

In [41]:
energy_dist_3D_optimized(results[0], observed)

array([[-1.77843557e-06, -1.77846801e-06, -1.77856557e-06, ...,
        -1.77856557e-06, -1.77846801e-06, -1.77843557e-06],
       [-1.77842657e-06, -1.77844027e-06, -1.77848557e-06, ...,
        -1.77848557e-06, -1.77844027e-06, -1.77842657e-06],
       [-1.77842290e-06, -1.77842812e-06, -1.77844678e-06, ...,
        -1.77844678e-06, -1.77842812e-06, -1.77842290e-06],
       ...,
       [-1.77842107e-06, -1.77842107e-06, -1.77842107e-06, ...,
        -1.77842098e-06, -1.77842093e-06, -1.77842085e-06],
       [-1.77842107e-06, -1.77842107e-06, -1.77842107e-06, ...,
        -1.77842105e-06, -1.77842104e-06, -1.77842102e-06],
       [-1.77842107e-06, -1.77842107e-06, -1.77842107e-06, ...,
        -1.77842106e-06, -1.77842106e-06, -1.77842106e-06]])