In [143]:
from sklearn.neighbors import NearestNeighbors
from scipy.ndimage import uniform_filter1d
from sklearn.cluster import DBSCAN
import numpy as np
import geopandas as gpd 
import os
import pandas as pd 
from shapely.ops import nearest_points
# ignore warnings
import warnings
warnings.filterwarnings('ignore')


In [144]:
# Shared neighbourhood size: the clustering cell uses it both as n_neighbors
# for the k-nearest-neighbour distances and as DBSCAN's min_samples.
sample_size = 3

In [145]:
def polygon_distance(polygon1, polygon2):
    """Return the shortest distance separating two geometries.

    Args:
        polygon1: First geometry (anything ``nearest_points`` accepts).
        polygon2: Second geometry.

    Returns:
        The distance between the closest pair of points, one on each geometry,
        in the units of the geometries' coordinates.
    """
    # nearest_points yields the closest point on each input, in argument order.
    closest_on_first, closest_on_second = nearest_points(polygon1, polygon2)
    gap = closest_on_first.distance(closest_on_second)
    return gap

def compute_distance_matrix(polygons):
    """Build the symmetric pairwise-distance matrix for a collection of polygons.

    Args:
        polygons: Sized, integer-indexable collection of geometries accepted
            by ``polygon_distance``.

    Returns:
        A square NumPy array whose entry ``[a, b]`` is the distance between
        ``polygons[a]`` and ``polygons[b]``; the diagonal stays at zero.
    """
    count = len(polygons)
    matrix = np.zeros((count, count))

    # Only the upper triangle is measured; each value is mirrored across the diagonal.
    for row in range(count):
        for col in range(row + 1, count):
            separation = polygon_distance(polygons[row], polygons[col])
            matrix[row, col] = separation
            matrix[col, row] = separation

    return matrix

In [146]:
# Input location (absolute, machine-specific Windows path).
data_dir = r'D:\Projects\superparcels\data\Rural\Crook_OR'
# NOTE(review): output_dir is not referenced by any later cell shown here (the
# exports below write into data_dir), and it points at an `urban` tree while
# data_dir is under `Rural` — confirm whether it is still wanted.
output_dir = r'D:\Projects\superparcels\data\urban\outputs\dmatrix'
# Candidate parcels to cluster; later cells read its OWNER and geometry columns.
parcels = gpd.read_file(os.path.join(data_dir, 'sp_sample_41013_cluster_canidates.shp'))
# EPSG code of the UTM zone that geopandas estimates for this dataset.
utm = parcels.estimate_utm_crs().to_epsg()

In [147]:
# Reproject to the estimated UTM zone so the distances, areas and buffer radii
# computed in later cells are in projected (metre) units.
parcels = parcels.to_crs(epsg=utm)  

In [148]:
parcels['OWNER'].value_counts()

OWNER
BRADLEYSANTUCCI                        24
CONNIE MHATFIELD                       18
R HORTON INCD R HORTON INC-PORTLAND    18
JOANNWEAVER                            14
STEPHEN TGILLEN                        14
                                       ..
IRA GESSOE                              2
MARY DMAYFIELD                          2
SUSAN KCRAWFORD                         2
DUSTINCOLLINS                           2
IMPROVEMENTIDLEWAY                      2
Name: count, Length: 1226, dtype: int64

In [149]:
# Distinct owner names; the clustering cell iterates over these.
unique_owners = parcels['OWNER'].unique()
print('Number of unique owners:', len(unique_owners))   

Number of unique owners: 1226


In [169]:
# Cluster each owner's parcels with DBSCAN on a precomputed polygon-to-polygon
# distance matrix, choosing eps per owner from the elbow of the sorted
# k-th-neighbour distances.
#
# The loop body deliberately stays at cell level (not wrapped in a function):
# later cells display `optimal_distance`, `distances`, `sorted_distances` and
# `difference` as left behind by the last processed owner.
distance_cap = 250        # upper bound applied to the estimated eps
clustered_frames = []     # per-owner rows that DBSCAN assigned to a cluster
single_frames = []        # per-owner rows that DBSCAN labelled as noise (-1)
cluster_counts_dict = {}  # owner -> total number of that owner's parcels

for owner in unique_owners:
    # NOTE(review): debugging filter — every owner except this one is skipped.
    # Remove it to process all owners.
    if owner != 'L RKOPCINSKI':
        continue

    # .copy() so the column assignments below write to an independent frame
    # instead of a slice of `parcels`.
    owner_parcels = parcels[parcels['OWNER'] == owner].copy()
    polygons = owner_parcels['geometry'].to_list()
    distance_matrix = compute_distance_matrix(polygons)

    if distance_matrix.shape[0] < 3:  # fewer than three parcels: skip this owner
        continue

    if np.all(distance_matrix == 0):
        # Every pairwise distance is zero: fall back to a small fixed eps.
        min_valid_distance = 3
        optimal_distance = 3
    else:
        # Smallest non-zero separation; not used by the clustering below.
        min_valid_distance = np.round(np.min(distance_matrix[distance_matrix > 0]))

        # distance_matrix is the precomputed distance matrix
        neighbors = NearestNeighbors(n_neighbors=sample_size, metric='precomputed')
        neighbors_fit = neighbors.fit(distance_matrix)

        try:
            distances, indices = neighbors_fit.kneighbors(distance_matrix)
        except ValueError:
            print(f'Error: Not enough samples in {owner}')
            print(distance_matrix)
            # Re-raise instead of sys.exit(): `sys` is never imported in this
            # notebook, so the old call failed with NameError and hid the cause.
            raise

        # Sorted distance from every parcel to its k-th nearest neighbour
        # (column sample_size-1 of the kneighbors result).
        sorted_distances = np.sort(distances[:, sample_size - 1])
        smooth_dist = uniform_filter1d(sorted_distances, size=5)
        difference = np.diff(smooth_dist)
        difference2 = np.diff(difference)
        # Elbow = position of the largest second difference of the smoothed curve.
        elbow_index = np.argmax(difference2) + 1
        optimal_distance = np.round(sorted_distances[elbow_index])
        if optimal_distance == 0:  # adjacent parcels
            optimal_distance = 1

        if optimal_distance > distance_cap:
            optimal_distance = distance_cap

    dbscan = DBSCAN(eps=optimal_distance, min_samples=sample_size, metric='precomputed')
    clusters = dbscan.fit_predict(distance_matrix)
    owner_parcels['cluster'] = clusters
    owner_parcels['area'] = owner_parcels['geometry'].area
    counts = owner_parcels['cluster'].value_counts()

    # DBSCAN labels noise points -1; those rows are kept apart as single parcels.
    is_noise = owner_parcels['cluster'] == -1
    single_parcel_filter = owner_parcels[is_noise]
    single_frames.append(single_parcel_filter)

    cluster_filter = owner_parcels[~is_noise].copy()
    if len(cluster_filter) > 0:
        # Sum over all labels, so this is the owner's total parcel count
        # (noise included), not the size of an individual cluster.
        cluster_counts_dict[owner] = int(counts.sum())
        cluster_filter['buff_dist'] = optimal_distance
        clustered_frames.append(cluster_filter)

# Concatenate once after the loop; fall back to an empty GeoDataFrame when no
# owner produced any rows.
clustered_parcel_data = (
    pd.concat(clustered_frames, ignore_index=True) if clustered_frames else gpd.GeoDataFrame()
)
single_parcel_data = (
    pd.concat(single_frames, ignore_index=True) if single_frames else gpd.GeoDataFrame()
)
    #print('______________________________________________________________________________________')

    
        

    

Example of an owner containing two parcels, with their respective distances to each other (meters)

In [170]:
optimal_distance

12.0

In [171]:
distances

array([[  0.        ,  12.19161688, 121.90883318],
       [  0.        ,  12.19161688, 121.90818142],
       [  0.        ,   9.39928466,  12.19138101],
       [  0.        ,  11.61237218,  12.19138101],
       [  0.        ,   9.39928466,  11.61237218]])

In [172]:
sorted_distances

array([ 11.61237218,  12.19138101,  12.19138101, 121.90818142,
       121.90883318])

In [163]:
difference

array([-664.96898359, -687.77856297,  -22.80957938])

In [164]:
clustered_parcel_data

In [126]:
# Build a per-cluster key of the form "<OWNER>_<cluster label>" on both result frames.
for frame in (clustered_parcel_data, single_parcel_data):
    frame['cluster_ID'] = frame['OWNER'] + '_' + frame['cluster'].astype(str)

In [127]:
# Attach a per-owner count. cluster_counts_dict holds the sum of that owner's
# DBSCAN label counts, i.e. all of the owner's parcels including noise-labelled
# ones — it is not the size of the row's own cluster.
clustered_parcel_data['count'] = clustered_parcel_data['OWNER'].map(cluster_counts_dict)


In [128]:
# Merge all parcels sharing a cluster_ID into one geometry per cluster, then
# turn cluster_ID back into a regular column.
# NOTE(review): non-geometry columns (e.g. area) are reduced by dissolve's
# default aggregation — presumably "first"; confirm that is acceptable.
parcel_dissolve = clustered_parcel_data.dissolve(by='cluster_ID').reset_index()

In [129]:
parcel_dissolve

Unnamed: 0,cluster_ID,geometry,FIPS,OWNER,duplicate_,duplicat_1,classifica,classifi_1,cluster,area,buff_dist,count
0,L RKOPCINSKI_0,"MULTIPOLYGON (((706935.316 4928108.743, 706687...",41013,L RKOPCINSKI,1,0,Class2: Duplicate Owner,2,0,199568.267682,122.0,5


In [130]:
def buffer_geometry(geometry, buff_dist):
    """Buffer a single geometry by the given distance.

    Args:
        geometry: Object exposing a ``buffer(distance)`` method.
        buff_dist: Distance handed to ``buffer``; the cell below passes
            negative values to shrink the geometry back.

    Returns:
        Whatever ``geometry.buffer(buff_dist)`` returns.
    """
    buffered = geometry.buffer(buff_dist)
    return buffered

# Grow every dissolved geometry by its own buff_dist, then shrink the result
# back by the same amount.
parcel_dissolve['geometry'] = [
    buffer_geometry(shape, radius)
    for shape, radius in zip(parcel_dissolve['geometry'], parcel_dissolve['buff_dist'])
]
parcel_dissolve['geometry'] = [
    buffer_geometry(shape, radius)
    for shape, radius in zip(parcel_dissolve['geometry'], -parcel_dissolve['buff_dist'])
]
# One row per geometry part, numbered within its cluster to form sp_id.
parcel_dissolve = parcel_dissolve.explode(ignore_index=True)
part_number = parcel_dissolve.groupby('cluster_ID').cumcount().astype(str)
parcel_dissolve['sp_id'] = parcel_dissolve['cluster_ID'] + "_" + part_number

In [131]:
# Export the dissolved parts as a shapefile next to the input data
# (written to data_dir, not output_dir).
parcel_dissolve.to_file(os.path.join(data_dir, 'parcel_dissolve.shp'))

In [53]:
# Recompute area from the current geometry, replacing the value carried through
# dissolve/explode.
# NOTE(review): in file order the shapefile export runs before this line, so the
# exported `area` column is not this recomputed value.
parcel_dissolve['area'] = parcel_dissolve['geometry'].area

In [54]:
parcel_dissolve.groupby('cluster_ID')['area'].mean().sort_values(ascending=False)

cluster_ID
COLEMANFOLEY_0    1.592413e+07
Name: area, dtype: float64

In [40]:
# Keep only clusters whose parts average more than min_super_parcel_area
# (area units follow the projected CRS — presumably square metres; confirm).
min_super_parcel_area = 100000
mean_area = parcel_dissolve.groupby('cluster_ID')['area'].mean()
super_parcel_ids = mean_area[mean_area > min_super_parcel_area].index
is_super_parcel = parcel_dissolve['cluster_ID'].isin(super_parcel_ids)
# .copy() because the next cell assigns to super_parcels['geometry']; without it
# that assignment targets a slice of parcel_dissolve (chained-assignment hazard).
super_parcels = parcel_dissolve[is_super_parcel][
    ['cluster_ID', 'OWNER', 'area', 'buff_dist', 'geometry']
].copy()

In [41]:
def buffer_geometry(geometry, buff_dist):
    """Return ``geometry`` buffered by ``buff_dist``.

    NOTE(review): a function with this same name and body is already defined in
    an earlier cell of this notebook; this cell re-declares it.

    Args:
        geometry: Object exposing a ``buffer(distance)`` method.
        buff_dist: Distance passed straight through to ``buffer``.

    Returns:
        The result of ``geometry.buffer(buff_dist)``.
    """
    result = geometry.buffer(buff_dist)
    return result

# Grow each super parcel by its own buff_dist (sign = 1), then shrink the
# result back by the same amount (sign = -1).
for sign in (1, -1):
    super_parcels['geometry'] = [
        buffer_geometry(shape, sign * radius)
        for shape, radius in zip(super_parcels['geometry'], super_parcels['buff_dist'])
    ]


In [29]:
# Grow then shrink every super parcel by one fixed distance.
# NOTE(review): `dbscan_distance` is not assigned in any cell visible in this
# notebook, so this cell raises NameError on a fresh kernel unless it is defined
# elsewhere. It looks like an earlier fixed-distance variant of the per-row
# buffer in the cell above — confirm whether it should be removed.
super_parcels['geometry'] = super_parcels['geometry'].buffer(dbscan_distance)
super_parcels['geometry'] = super_parcels['geometry'].buffer(-dbscan_distance)


In [42]:
# Export the selected super parcels as a shapefile in data_dir.
super_parcels.to_file(os.path.join(data_dir, 'super_parcels_rbuff_var.shp'))