# Import irregularly gridded reservoir models and resample them regularly with nearest neighbour

In [1]:
import numpy as np
import pandas as pd
import scipy.spatial
from tqdm.auto import tqdm

from annoy import AnnoyIndex
import random

### Create artificial, irregular model

In [2]:
# size of dataset
size = 100
#size = 120*166*105

# Seed NumPy's global random generator so that the synthetic model created in the
# following cells is identical on every "Restart Kernel -> Run All".
SEED = 42
np.random.seed(SEED)

In [3]:
# Assume a parameter of the grid cells, named zone
zone = np.random.randint(0,64+1, size=size)
zone

array([47, 33, 10, 51, 16,  7, 15, 38, 16, 59, 17, 52, 46, 53, 43, 54, 52,
       47, 15, 29, 38, 61, 31, 45, 62, 46, 59, 44, 41,  6,  4, 26, 35, 53,
       47, 43, 22, 64, 19, 10, 31, 17, 23, 63, 34, 22, 14,  7, 15,  8, 27,
        8, 35, 45, 59,  2, 55, 40,  0, 42, 16, 38, 33, 17,  7, 36, 32, 60,
       37,  3,  2, 35, 11,  4, 15, 22, 26, 21, 16, 34, 17,  1, 35, 49, 52,
       41, 64, 30, 50, 36, 10, 29, 63, 14, 58, 10, 11, 46, 24, 48])

In [4]:
x = np.random.random(size=size) * 300
x[0:10]

array([ 58.33176053, 186.24550183, 213.81581259, 280.07745789,
        34.45024899,  16.23962987, 176.39815524,  15.59225876,
       277.14024396,  16.35417611])

In [5]:
y = np.random.random(size=size) * 400
y[0:10]

array([303.41082574,  91.67373937,  14.23229646, 200.01866935,
       372.66578874, 129.94735174, 333.67017496, 180.0979077 ,
        73.81213755, 267.36097253])

In [6]:
z = np.random.random(size=size) * 100
z[0:10]

array([30.62159707, 19.76450168, 77.34919676, 62.16576916, 64.18028825,
       46.00662348, 88.36554776, 86.28358408, 44.42652456, 90.54812522])

In [7]:
#300*400*100

In [8]:
# Collect the coordinates and the zone property in a single table; the column
# order (x, y, z, zone) is relied upon later when the first three columns are sliced.
data = pd.DataFrame({'x': x, 'y': y, 'z': z, 'zone': zone})
data.head()

Unnamed: 0,x,y,z,zone
0,58.331761,303.410826,30.621597,47
1,186.245502,91.673739,19.764502,33
2,213.815813,14.232296,77.349197,10
3,280.077458,200.018669,62.165769,51
4,34.450249,372.665789,64.180288,16


In [9]:
# xarray representation, might be useful at some point
#xd = df.set_index(['x','y','z']).to_xarray()
#xd

## Create regular grid for lookup

In [10]:
# Axis coordinates of the regular lookup grid, given as (exclusive upper bound, spacing)
# for x, y and z respectively.
rx, ry, rz = (
    np.arange(0, upper, spacing)
    for upper, spacing in ((300, 30), (400, 20), (100, 10))
)

In [11]:
a,b,c = np.meshgrid(rx,ry,rz)

In [12]:
# Flatten the three meshgrid cubes into one row per regular grid point.
grid = pd.DataFrame({'x': a.ravel(), 'y': b.ravel(), 'z': c.ravel()})
grid.head()

Unnamed: 0,x,y,z
0,0,0,0
1,0,0,10
2,0,0,20
3,0,0,30
4,0,0,40


### Develop nearest neighbour algorithm

In [13]:
# Check shapes
data.values.shape, grid.values.shape

((100, 4), (2000, 3))

In [14]:
def nearest_neighbour(data, grid, threshold=10):
    """Look up the closest data point for every grid point via a NaN-masked distance matrix.

    The full (grid x data) Euclidean distance matrix is built first; entries that are
    not strictly below ``threshold`` are replaced by NaN, and grid points whose whole
    row is NaN are reported as invalid.

    Args:
        data (array): n x 3 array with x,y,z coordinates of irregular grid points
        grid (array): m x 3 array with x,y,z coordinates of regular grid points
        threshold (float): Maximum distance (exclusive) within which a neighbour is
            accepted; ``None`` keeps every distance

    Returns:
        mask (bool array): 1D mask of length m, True where the grid point has a neighbour
        idx (int array): 1D array of size grid[mask], indices of the nearest point in data
        dis (float array): 1D array of size grid[mask], distances to the indexed neighbour
    """

    # dense matrix: one row per grid point, one column per data point
    pairwise = scipy.spatial.distance.cdist(grid, data)

    if threshold is not None:
        # everything that is not strictly closer than the threshold is discarded
        pairwise[~(pairwise < threshold)] = np.nan

    # a grid point is usable as soon as its row is not entirely NaN
    mask = ~np.isnan(pairwise).all(axis=1)
    candidates = pairwise[mask]

    # NaN-aware reductions: position and value of the smallest remaining distance
    idx = np.nanargmin(candidates, axis=1)
    dis = np.nanmin(candidates, axis=1)

    return mask, idx, dis

In [15]:
def nn_vectorized(data, grid, threshold=10):
    """Find the nearest data point of every grid point with one dense distance matrix.

    The complete (grid x data) distance matrix is held in memory, so this variant is
    only suitable for small inputs.

    Args:
        data (array): n x 3 array with x,y,z coordinates of irregular grid points
        grid (array): m x 3 array with x,y,z coordinates of regular grid points
        threshold (float or None): Maximum distance (exclusive) within which a neighbour
            is accepted. With ``None`` no filtering takes place and every grid point
            is reported as valid.

    Returns:
        mask (bool array): 1D mask of length m defining the validity of grid cells
        idx (int array): 1D array of size grid[mask], indices of the nearest point in data
        mindist (float array): 1D array of size grid[mask], distances to the indexed neighbour
    """

    # calculate distances between all grid points (rows) and data points (columns)
    dist = scipy.spatial.distance.cdist(grid, data)

    # position and value of the smallest distance in every row
    idx = np.argmin(dist, axis=1)
    mindist = np.min(dist, axis=1)

    if threshold is None:
        # Previously `mask` stayed undefined on this path and the return statement
        # raised UnboundLocalError; without a threshold every grid point is valid.
        mask = np.ones(idx.shape[0], dtype=bool)
    else:
        # keep only the grid points that have a neighbour strictly within the threshold
        mask = mindist < threshold
        idx = idx[mask]
        mindist = mindist[mask]

    return mask, idx, mindist

In [16]:
def nn_elemwise(data, grid, threshold=10):
    """Find the nearest data point of every grid point, one grid point at a time.

    Only a single row of the distance matrix exists at any moment, which keeps memory
    use low at the price of a Python-level loop over all grid points.

    Args:
        data (array): n x 3 array with x,y,z coordinates of irregular grid points
        grid (array): m x 3 array with x,y,z coordinates of regular grid points
        threshold (float or None): Maximum distance (exclusive) within which a neighbour
            is accepted. With ``None`` no filtering takes place and every grid point
            is reported as valid.

    Returns:
        mask (bool array): 1D mask of length m defining the validity of grid cells
        idx (int array): 1D array of size grid[mask], indices of the nearest point in data
        mindist (float array): 1D array of size grid[mask], distances to the indexed neighbour
    """

    gs = grid.shape[0]

    # `np.int` was removed from NumPy (1.24); the builtin `int` is the same dtype
    idx = np.empty(gs, dtype=int)
    mindist = np.empty(gs)

    for i in tqdm(range(gs), 'Calculating distances...'):
        # distances from this single grid point to all data points, shape (1, n)
        dist = scipy.spatial.distance.cdist(grid[i:i + 1], data)

        # index of the closest data point, and the distance belonging to exactly
        # that index (keeps idx and mindist consistent with each other)
        nearest = np.argmin(dist)
        idx[i] = nearest
        mindist[i] = dist[0, nearest]

    if threshold is None:
        # Previously `mask` stayed undefined on this path and the return statement
        # raised UnboundLocalError; without a threshold every grid point is valid.
        mask = np.ones(gs, dtype=bool)
    else:
        # get "valid" distances, indices and the mask to filter grid
        tqdm.write('Creating and applying mask...')
        mask = mindist < threshold
        idx = idx[mask]
        mindist = mindist[mask]

    return mask, idx, mindist

In [46]:
def nn_spotify(data, grid, threshold=10, tree_number=10):
    """Find the nearest data point of every grid point with a Spotify Annoy index.

    All data points are inserted into an ``AnnoyIndex`` using the Euclidean metric and
    every grid point is then queried for its single nearest neighbour. Annoy is an
    approximate nearest-neighbour library, so results may deviate slightly from the
    exact brute-force functions (more trees improve accuracy).

    Args:
        data (array): n x d array with the coordinates of irregular grid points
            (d = 3 for x,y,z)
        grid (array): m x d array with the coordinates of regular grid points
        threshold (float or None): Maximum distance (exclusive) within which a neighbour
            is accepted. With ``None`` no filtering takes place and every grid point
            is reported as valid.
        tree_number (int): Number of trees passed to ``AnnoyIndex.build``

    Returns:
        mask (bool array): 1D mask of length m defining the validity of grid cells
        idx (int array): 1D array of size grid[mask], indices of the nearest point in data
        mindist (float array): 1D array of size grid[mask], distances to the indexed neighbour
    """

    ds = data.shape[0]
    gs = grid.shape[0]

    # `np.int` was removed from NumPy (1.24); the builtin `int` is the same dtype
    idx = np.empty(gs, dtype=int)
    mindist = np.empty(gs)

    # the index dimensionality follows the input instead of being fixed to 3
    data_lookup = AnnoyIndex(data.shape[1], 'euclidean')

    # add all data points to the annoy lookup object
    for i in tqdm(range(ds), 'Building data lookup...'):
        data_lookup.add_item(i, data[i])

    tqdm.write('Building trees... (That can take a while)')
    data_lookup.build(tree_number)  # after building, no further items can be added

    # query the single nearest neighbour (index and distance) of each grid point
    for i in tqdm(range(gs), 'Calculating distances...'):
        neighbours, distances = data_lookup.get_nns_by_vector(grid[i], 1, include_distances=True)
        idx[i] = neighbours[0]
        mindist[i] = distances[0]

    if threshold is None:
        # Previously `mask` stayed undefined on this path and the return statement
        # raised UnboundLocalError; without a threshold every grid point is valid.
        mask = np.ones(gs, dtype=bool)
    else:
        # get "valid" distances, indices and the mask to filter grid
        tqdm.write('Creating and applying mask...')
        mask = mindist < threshold
        idx = idx[mask]
        mindist = mindist[mask]

    return mask, idx, mindist

In [18]:
threshold = 10

### Application Vectorized

In [19]:
# Only the first three columns (x, y, z) of `data` are coordinates; `zone` is left out.
mask, idx, dis = nn_vectorized(data.iloc[:, :3].to_numpy(), grid.to_numpy(), threshold=threshold)

# summary: shape of each result plus the number of valid cells / the largest index / distance
for label, result, reducer in (
    ('Mask: ', mask, np.sum),
    ('Indices: ', idx, np.max),
    ('Distances: ', dis, np.max),
):
    print(label, result.shape, reducer(result))

Mask:  (2000,) 54
Indices:  (54,) 94
Distances:  (54,) 9.989882470233326


In [20]:
valid_grid = grid.loc[mask].copy()
valid_grid.head()

Unnamed: 0,x,y,z
24,60,0,40
44,120,0,40
78,210,0,80
113,30,20,30
114,30,20,40


In [21]:
# `idx` holds row positions into the coordinate array handed to the search function,
# so the zone values must be looked up by position (`.iloc`), not by index label.
valid_grid['zone'] = data['zone'].iloc[idx].to_numpy()
# distance to the chosen neighbour, kept for checking the threshold
valid_grid['dis'] = dis
valid_grid.head(6)

Unnamed: 0,x,y,z,zone,dis
24,60,0,40,17,8.183178
44,120,0,40,16,7.524357
78,210,0,80,23,9.868706
113,30,20,30,1,8.958069
114,30,20,40,1,4.725459
167,180,20,70,58,7.753416


In [22]:
# check validity of threshold
valid_grid.dis.max()

9.989882470233326

### Application Element-wise

In [23]:
# Same lookup as before, but computed one grid point at a time.
mask2, idx2, dis2 = nn_elemwise(data.iloc[:, :3].to_numpy(), grid.to_numpy(), threshold=threshold)

# summary: shape of each result plus the number of valid cells / the largest index / distance
for label, result, reducer in (
    ('Mask: ', mask2, np.sum),
    ('Indices: ', idx2, np.max),
    ('Distances: ', dis2, np.max),
):
    print(label, result.shape, reducer(result))

HBox(children=(IntProgress(value=0, description='Calculating distances...', max=2000, style=ProgressStyle(desc…


Creating and applying mask...
Mask:  (2000,) 54
Indices:  (54,) 94
Distances:  (54,) 9.989882470233326


In [24]:
# check conformity of functions
# np.array_equal also copes with results of different length (it returns False),
# whereas an element-wise `==` on mismatched shapes cannot be evaluated meaningfully.
np.array_equal(mask, mask2), np.array_equal(idx, idx2), np.array_equal(dis, dis2)

(True, True, True)

### Application Spotify

In [25]:
# Same lookup again, this time through the Annoy index built from 10 trees.
mask3, idx3, dis3 = nn_spotify(
    data.iloc[:, :3].to_numpy(),
    grid.to_numpy(),
    threshold=threshold,
    tree_number=10,
)

# summary: shape of each result plus the number of valid cells / the largest index / distance
for label, result, reducer in (
    ('Mask: ', mask3, np.sum),
    ('Indices: ', idx3, np.max),
    ('Distances: ', dis3, np.max),
):
    print(label, result.shape, reducer(result))

HBox(children=(IntProgress(value=0, description='Building lookup...', style=ProgressStyle(description_width='i…


Building trees... (That can take a while)


HBox(children=(IntProgress(value=0, description='Calculating distances...', max=2000, style=ProgressStyle(desc…


Creating and applying mask...
Mask:  (2000,) 54
Indices:  (54,) 94
Distances:  (54,) 9.989895820617676


In [26]:
# check conformity of functions
np.all(mask == mask3)

True

In [27]:
# The Annoy result may contain a different number of valid cells than the exact one;
# np.array_equal then simply reports False instead of comparing mismatched shapes.
np.array_equal(idx, idx3)

True

In [28]:
# The Annoy distances agree with the exact ones only to a few decimals (compare the
# printed maxima above), so test with an absolute tolerance. Comparing rounded values
# can fail spuriously when two almost equal numbers fall on either side of a rounding
# boundary.
dis.shape == dis3.shape and np.allclose(dis, dis3, rtol=0, atol=1e-3)

True

# Application to large data-sets

In [29]:
# size of dataset
size = 120*166*105

In [31]:
x = np.random.random(size=size) * 300
y = np.random.random(size=size) * 400
z = np.random.random(size=size) * 100
# Assume a parameter of the grid cells, named zone
zone = np.random.randint(0,64+1, size=size)

In [32]:
#300*400*100

In [41]:
# combine in a pandas dataframe for readability
data = pd.DataFrame.from_dict({'x': x, 'y': y, 'z': z, 'zone': zone})
data.head(1)

Unnamed: 0,x,y,z,zone
0,110.411196,311.235695,96.59083,63


In [34]:
# xarray representation, might be useful at some point
#xd = df.set_index(['x','y','z']).to_xarray()
#xd

## Create regular grid for lookup

In [42]:
rx = np.arange(0,300,1)
ry = np.arange(0,400,1)
rz = np.arange(0,100,10)

In [43]:
a,b,c = np.meshgrid(rx,ry,rz)

In [50]:
grid = pd.DataFrame.from_dict({'x': a.ravel(), 'y': b.ravel(), 'z': c.ravel()})
grid.head(2)

Unnamed: 0,x,y,z
0,0,0,0
1,0,0,10


# Calculation with Spotify Annoy

In [48]:
mask4, idx4, dis4 = nn_spotify(data.iloc[:,0:3].values, grid.values, threshold=2, tree_number=10)

HBox(children=(IntProgress(value=0, description='Building data lookup...', max=2091600, style=ProgressStyle(de…


Building trees... (That can take a while)


HBox(children=(IntProgress(value=0, description='Calculating distances...', max=1200000, style=ProgressStyle(d…


Creating and applying mask...


In [49]:
print('Mask: ', mask4.shape, np.sum(mask4))
print('Indices: ', idx4.shape, np.max(idx4))
print('Distances: ', dis4.shape, np.max(dis4))

Mask:  (1200000,) 1184652
Indices:  (1184652,) 2091594
Distances:  (1184652,) 1.9999750852584839
