# Import irregularly gridded reservoir models and resample them regularly with nearest neighbour

In [52]:
import numpy as np
import pandas as pd
import scipy.spatial
from tqdm.auto import tqdm

from annoy import AnnoyIndex
import random

### Create artificial, irregular model

In [53]:
# Number of irregular data points in the small demo model
size = 100
# Full-scale alternative (the same value is used in the large data-set section):
#size = 120*166*105

In [54]:
# Assume a parameter of the grid cells, named zone:
# one random integer per data point, drawn from 0..64 (randint's upper bound is exclusive)
zone = np.random.randint(0,64+1, size=size)
zone

array([54, 14, 21, 62,  6, 56, 36, 43, 46, 38, 61, 26,  9, 40, 39, 38, 62,
       44, 15,  0, 32, 51, 19, 17, 15, 61, 45, 20, 53, 31, 54,  4, 41, 33,
       28, 58,  1, 14, 52, 27, 20, 45, 19, 38, 30, 10, 26, 29, 14,  8, 58,
       57, 31, 51, 58, 15, 57, 54, 16,  4, 64,  6, 32, 32, 44, 60, 36, 62,
       28,  4, 26, 34, 25, 32, 32, 14,  1, 58, 48, 59, 29, 28, 18, 29, 22,
       53, 61, 54, 42, 23, 59, 50, 45, 22, 57, 44, 42,  6, 52, 49])

In [55]:
# Random x coordinates of the data points, scaled to the range [0, 300)
x = np.random.random(size=size) * 300
x[0:10]  # preview the first ten values

array([ 63.41012257,  44.51520036,  70.66870286, 100.02266983,
       192.83227372, 132.33777307,  70.40829849, 258.19810291,
       189.37478773,  28.96923354])

In [56]:
# Random y coordinates of the data points, scaled to the range [0, 400)
y = np.random.random(size=size) * 400
y[0:10]  # preview the first ten values

array([291.00648038, 385.03553051, 286.17714255,  82.55406012,
        68.09871713, 192.64648293, 334.5850094 , 332.75020959,
       131.86564703, 343.31476333])

In [57]:
# Random z coordinates of the data points, scaled to the range [0, 100)
z = np.random.random(size=size) * 100
z[0:10]  # preview the first ten values

array([37.3066156 , 67.77520714, 35.56966046, 11.4868504 , 78.44643609,
       30.47484209,  7.95087942, 86.73956794,  9.6359521 , 90.71220981])

In [58]:
#300*400*100

In [59]:
# combine in a pandas dataframe for readability
# (columns 0-2 hold the coordinates x, y, z; column 3 holds the zone property)
data = pd.DataFrame.from_dict({'x': x, 'y': y, 'z': z, 'zone': zone})
data.head()

Unnamed: 0,x,y,z,zone
0,63.410123,291.00648,37.306616,54
1,44.5152,385.035531,67.775207,14
2,70.668703,286.177143,35.56966,21
3,100.02267,82.55406,11.48685,62
4,192.832274,68.098717,78.446436,6


In [60]:
# xarray representation, might be useful at some point
#xd = data.set_index(['x','y','z']).to_xarray()
#xd

## Create regular grid for lookup

In [61]:
# Axis coordinates of the regular target grid: 10 x 20 x 10 = 2000 nodes
rx = np.arange(0,300,30)
ry = np.arange(0,400,20)
rz = np.arange(0,100,10)

In [62]:
# Expand the three axes into full 3D coordinate arrays (a: x, b: y, c: z of every grid node)
a,b,c = np.meshgrid(rx,ry,rz)

In [63]:
# Flatten the coordinate arrays into a table with one row per regular grid node
grid = pd.DataFrame.from_dict({'x': a.ravel(), 'y': b.ravel(), 'z': c.ravel()})
grid.head()

Unnamed: 0,x,y,z
0,0,0,0
1,0,0,10
2,0,0,20
3,0,0,30
4,0,0,40


### Develop nearest neighbour algorithm

In [64]:
# Check shapes: data carries four columns (x, y, z, zone), grid only the three coordinates
data.values.shape, grid.values.shape

((100, 4), (2000, 3))

In [65]:
def nearest_neighbour(data, grid, threshold=10):
    """Match every regular grid point to its closest irregular data point.

    All pairwise distances are computed in one go, so the intermediate
    distance matrix has ``len(grid) * len(data)`` entries.

    Args:
        data (array): n x 3 array with x,y,z coordinates of irregular grid points
        grid (array): m x 3 array with x,y,z coordinates of regular grid points
        threshold (float): Distance below which a neighbour is accepted;
            ``None`` accepts every distance

    Returns:
        mask (bool array): 1D array of length m, True where a grid point has a neighbour closer than threshold
        idx (int array): 1D array of size grid[mask], with indices pointing to the nearest point in data
        dis (float array): 1D array of size grid[mask], with distances to the indexed neighbour (for testing)
    """
    # rows correspond to grid points, columns to data points
    pairwise = scipy.spatial.distance.cdist(grid, data)

    # distances at or beyond the threshold are blanked out with NaN
    if threshold is not None:
        pairwise = np.where(pairwise < threshold, pairwise, np.nan)

    # a grid point is usable as soon as at least one of its distances survived
    has_neighbour = np.logical_not(np.isnan(pairwise).all(axis=1))
    candidates = pairwise[has_neighbour]

    # NaN-aware reductions skip the blanked-out entries
    nearest_dist = np.nanmin(candidates, axis=1)
    nearest_idx = np.nanargmin(candidates, axis=1)

    return has_neighbour, nearest_idx, nearest_dist

In [66]:
def nn_vectorized(data, grid, threshold=10):
    """Find nearest neighbour of a new grid-cell in a set of data-grid-cells

    Computes the full distance matrix between grid and data in one call, so
    the intermediate array has ``len(grid) * len(data)`` entries.

    Args:
        data (array): n x 3 array with x,y,z coordinates of irregular grid points
        grid (array): m x 3 array with x,y,z coordinates of regular grid points
        threshold (float): Distance below which a neighbour is accepted;
            ``None`` disables the filtering and keeps every grid cell

    Returns:
        mask (bool array): 1D array of length m defining the validity of grid cells dependent on threshold
            (all True when threshold is None)
        idx (int array): 1D array of size grid[mask], with indices pointing to nearest point in data
        mindist (float array): 1D array of size grid[mask], with distances to indexed neighbour (for testing)
    """

    # calculate distances between all grid points (rows) and data points (columns)
    dist = scipy.spatial.distance.cdist(grid, data)

    # find minimum distance indices for each grid cell
    idx = np.argmin(dist, axis=1)

    # find minimum distances for each grid cell
    mindist = np.min(dist, axis=1)

    if threshold is None:
        # no filtering requested: every grid cell counts as valid
        # (previously mask was left undefined here and the return failed)
        mask = np.ones(dist.shape[0], dtype=bool)
    else:
        # get "valid" distances, indices and the mask to filter grid
        mask = mindist < threshold
        idx = idx[mask]
        mindist = mindist[mask]

    return mask, idx, mindist

In [67]:
def nn_elemwise(data, grid, threshold=10):
    """Find nearest neighbour of a new grid-cell in a set of data-grid-cells

    Processes one grid point at a time, so only a single row of distances
    (one value per data point) is held at once. A tqdm progress bar tracks
    the loop.

    Args:
        data (array): n x 3 array with x,y,z coordinates of irregular grid points
        grid (array): m x 3 array with x,y,z coordinates of regular grid points
        threshold (float): Distance below which a neighbour is accepted;
            ``None`` disables the filtering and keeps every grid cell

    Returns:
        mask (bool array): 1D array of length m defining the validity of grid cells dependent on threshold
            (all True when threshold is None)
        idx (int array): 1D array of size grid[mask], with indices pointing to nearest point in data
        mindist (float array): 1D array of size grid[mask], with distances to indexed neighbour (for testing)
    """

    gs = grid.shape[0]

    # np.intp is NumPy's index integer type; the former np.int alias was removed from NumPy
    idx = np.empty(gs, dtype=np.intp)
    mindist = np.empty(gs)

    iteration = tqdm(np.arange(gs), 'Calculating distances...')
    for i in iteration:
        # distances from this single grid point to all data points, shape (1, n)
        dist = scipy.spatial.distance.cdist([grid[i]], data)

        # find minimum distance index for this grid cell
        nearest = np.argmin(dist)
        idx[i] = nearest

        # read the distance at that same index so idx and mindist always describe the same data point
        mindist[i] = dist[0, nearest]

    if threshold is None:
        # no filtering requested: every grid cell counts as valid
        # (previously mask was left undefined here and the return failed)
        mask = np.ones(gs, dtype=bool)
    else:
        # get "valid" distances, indices and the mask to filter grid
        tqdm.write('Creating and applying mask...')
        mask = mindist < threshold
        idx = idx[mask]
        mindist = mindist[mask]

    return mask, idx, mindist

In [68]:
def nn_spotify(data, grid, threshold=10, tree_number=10):
    """Find nearest neighbour of a new grid-cell in a set of data-grid-cells

    Uses an Annoy index built from the data points and queries it once per
    grid point. NOTE: Annoy is an approximate nearest-neighbour library, so
    the returned match is not guaranteed to be the exact nearest point.

    Args:
        data (array): n x 3 array with x,y,z coordinates of irregular grid points
        grid (array): m x 3 array with x,y,z coordinates of regular grid points
        threshold (float): Distance below which a neighbour is accepted;
            ``None`` disables the filtering and keeps every grid cell
        tree_number (int): Number of trees passed to ``AnnoyIndex.build``

    Returns:
        mask (bool array): 1D array of length m defining the validity of grid cells dependent on threshold
            (all True when threshold is None)
        idx (int array): 1D array of size grid[mask], with indices pointing to nearest point in data
        mindist (float array): 1D array of size grid[mask], with distances to indexed neighbour (for testing)
    """

    ds = data.shape[0]
    gs = grid.shape[0]

    # np.intp is NumPy's index integer type; the former np.int alias was removed from NumPy
    idx = np.empty(gs, dtype=np.intp)
    mindist = np.empty(gs)

    data_lookup = AnnoyIndex(3, 'euclidean')  # dimensions and distance metric

    # add all data points to the annoy lookup object
    range_a = tqdm(np.arange(ds), 'Building data lookup...')  # track process
    for i in range_a:
        data_lookup.add_item(i, data[i])

    tqdm.write('Building trees... (That can take a while)')
    data_lookup.build(tree_number)  # build lookup trees, afterwards items cannot be added anymore

    # start lookup for each grid point
    range_b = tqdm(np.arange(gs), 'Calculating distances...')
    for i in range_b:
        # ask for the single nearest neighbour; with include_distances the result is (indices, distances)
        neighbours, distances = data_lookup.get_nns_by_vector(grid[i], 1, include_distances=True)
        idx[i] = neighbours[0]  # minimum distance index for each grid point
        mindist[i] = distances[0]  # minimum distance for each grid point

    if threshold is None:
        # no filtering requested: every grid cell counts as valid
        # (previously mask was left undefined here and the return failed)
        mask = np.ones(gs, dtype=bool)
    else:
        # get "valid" distances, indices and the mask to filter grid
        tqdm.write('Creating and applying mask...')
        mask = mindist < threshold
        idx = idx[mask]
        mindist = mindist[mask]

    return mask, idx, mindist

In [69]:
# Maximum accepted neighbour distance, passed to the nn_* calls on the small demo model
threshold = 10

### Application Vectorized

In [70]:
# Nearest-neighbour lookup using only the coordinate columns (x, y, z) of data
mask, idx, dis = nn_vectorized(data.iloc[:,0:3].values, grid.values, threshold=threshold)
# Report the array shapes plus: number of valid grid cells, largest matched data index, largest accepted distance
print('Mask: ', mask.shape, np.sum(mask))
print('Indices: ', idx.shape, np.max(idx))
print('Distances: ', dis.shape, np.max(dis))

Mask:  (2000,) 67
Indices:  (67,) 99
Distances:  (67,) 9.988325717317261


In [71]:
# Keep only the grid cells that found a neighbour within the threshold;
# work on a copy so columns can be added without touching grid
valid_grid = grid.loc[mask].copy()
valid_grid.head()

Unnamed: 0,x,y,z
133,90,20,30
169,180,20,90
196,270,20,60
197,270,20,70
340,120,60,0


In [72]:
# Transfer the zone of the nearest data point to each valid grid cell.
# idx holds positional row numbers, so select by position (.iloc) instead of by index label.
valid_grid['zone'] = data['zone'].iloc[idx].values
# Keep the neighbour distance so it can be checked against the threshold
valid_grid['dis'] = dis
valid_grid.head(6)

Unnamed: 0,x,y,z,zone,dis
133,90,20,30,14,9.093656
169,180,20,90,57,9.914815
196,270,20,60,57,4.508971
197,270,20,70,57,8.971577
340,120,60,0,52,6.631212
341,120,60,10,52,8.145988


In [73]:
# check validity of threshold: the largest accepted distance must stay below it
valid_grid.dis.max()

9.988325717317261

### Application Element-wise

In [74]:
# Same lookup as before, but computed one grid point at a time
mask2, idx2, dis2 = nn_elemwise(data.iloc[:,0:3].values, grid.values, threshold=threshold)
# Report the array shapes plus: number of valid grid cells, largest matched data index, largest accepted distance
print('Mask: ', mask2.shape, np.sum(mask2))
print('Indices: ', idx2.shape, np.max(idx2))
print('Distances: ', dis2.shape, np.max(dis2))

HBox(children=(IntProgress(value=0, description='Calculating distances...', max=2000, style=ProgressStyle(desc…


Creating and applying mask...
Mask:  (2000,) 67
Indices:  (67,) 99
Distances:  (67,) 9.988325717317261


In [75]:
# check conformity of functions: element-wise results must equal the vectorized ones exactly
np.all(mask == mask2), np.all(idx == idx2), np.all(dis == dis2)

(True, True, True)

### Application Spotify

In [84]:
# Same lookup as before, this time through the Annoy index with 30 trees
mask3, idx3, dis3 = nn_spotify(data.iloc[:,0:3].values, grid.values, threshold=threshold, tree_number=30)
# Report the array shapes plus: number of valid grid cells, largest matched data index, largest accepted distance
print('Mask: ', mask3.shape, np.sum(mask3))
print('Indices: ', idx3.shape, np.max(idx3))
print('Distances: ', dis3.shape, np.max(dis3))

HBox(children=(IntProgress(value=0, description='Building data lookup...', style=ProgressStyle(description_wid…


Building trees... (That can take a while)


HBox(children=(IntProgress(value=0, description='Calculating distances...', max=2000, style=ProgressStyle(desc…


Creating and applying mask...
Mask:  (2000,) 67
Indices:  (67,) 99
Distances:  (67,) 9.988335609436035


In [85]:
# check conformity of functions: the Annoy mask must select the same grid cells as the vectorized one
np.all(mask == mask3)

True

In [86]:
# the Annoy lookup must point to the same data points as the vectorized one
np.all(idx == idx3)

True

In [87]:
# Annoy distances differ slightly from the cdist ones (compare the recorded maxima),
# so compare after rounding to three decimals instead of testing exact equality
np.all(np.round(dis, 3) == np.round(dis3, 3))

True

# Application to large data-sets

### Test data

In [29]:
# size of dataset: 120*166*105 = 2,091,600 data points for the large-scale test
size = 120*166*105

In [31]:
# Random coordinates of the data points, scaled to [0, 300) x [0, 400) x [0, 100)
x = np.random.random(size=size) * 300
y = np.random.random(size=size) * 400
z = np.random.random(size=size) * 100
# Assume a parameter of the grid cells, named zone (random integers from 0..64)
zone = np.random.randint(0,64+1, size=size)

In [32]:
#300*400*100

In [41]:
# combine in a pandas dataframe for readability
# (columns 0-2 hold the coordinates x, y, z; column 3 holds the zone property)
data = pd.DataFrame.from_dict({'x': x, 'y': y, 'z': z, 'zone': zone})
data.head(1)

Unnamed: 0,x,y,z,zone
0,110.411196,311.235695,96.59083,63


In [34]:
# xarray representation, might be useful at some point
#xd = data.set_index(['x','y','z']).to_xarray()
#xd

### Test regular grid for lookup

In [42]:
# Axis coordinates of the large regular grid: 300 x 400 x 10 = 1,200,000 nodes
rx = np.arange(0,300,1)
ry = np.arange(0,400,1)
rz = np.arange(0,100,10)

In [43]:
# Expand the three axes into full 3D coordinate arrays (a: x, b: y, c: z of every grid node)
a,b,c = np.meshgrid(rx,ry,rz)

In [50]:
# Flatten the coordinate arrays into a table with one row per regular grid node
grid = pd.DataFrame.from_dict({'x': a.ravel(), 'y': b.ravel(), 'z': c.ravel()})
grid.head(2)

Unnamed: 0,x,y,z
0,0,0,0
1,0,0,10


# Calculation with Spotify Annoy

In [48]:
# Annoy-based lookup on the large test set, with a neighbour threshold of 2 and 10 trees
mask4, idx4, dis4 = nn_spotify(data.iloc[:,0:3].values, grid.values, threshold=2, tree_number=10)

HBox(children=(IntProgress(value=0, description='Building data lookup...', max=2091600, style=ProgressStyle(de…


Building trees... (That can take a while)


HBox(children=(IntProgress(value=0, description='Calculating distances...', max=1200000, style=ProgressStyle(d…


Creating and applying mask...


In [51]:
# Report the array shapes plus: number of valid grid cells, largest matched data index, largest accepted distance
print('Mask: ', mask4.shape, np.sum(mask4))
print('Indices: ', idx4.shape, np.max(idx4))
print('Distances: ', dis4.shape, np.max(dis4))

Mask:  (1200000,) 1184652
Indices:  (1184652,) 2091594
Distances:  (1184652,) 1.9999750852584839
