In [9]:
from sklearn.neighbors import NearestNeighbors
from scipy.ndimage import uniform_filter1d
from sklearn.cluster import DBSCAN
import numpy as np
import geopandas as gpd 
import os
import pandas as pd 
from shapely.ops import nearest_points
# ignore warnings
import warnings
warnings.filterwarnings('ignore')


In [11]:
sample_size = 3

In [3]:
def polygon_distance(polygon1, polygon2):
    # Calculate the minimum distance between two polygons
    point1, point2 = nearest_points(polygon1, polygon2)
    return point1.distance(point2)

def compute_distance_matrix(polygons):
    # Create a distance matrix between all polygons
    num_polygons = len(polygons)
    distance_matrix = np.zeros((num_polygons, num_polygons))
    
    for i in range(num_polygons):
        for j in range(i + 1, num_polygons):
            distance_matrix[i, j] = polygon_distance(polygons[i], polygons[j])
            distance_matrix[j, i] = distance_matrix[i, j]  # Symmetry
    
    return distance_matrix

In [43]:
data_dir = r'D:\Projects\superparcels\data\Urban\Alameda_CA'
output_dir = r'D:\Projects\superparcels\data\urban\outputs\dmatrix'
parcels = gpd.read_file(os.path.join(data_dir, 'sp_sample_06001_cluster_canidates.shp'))
utm = parcels.estimate_utm_crs().to_epsg()

In [44]:
parcels = parcels.to_crs(epsg=utm)  

In [45]:
parcels['OWNER'].value_counts()

OWNER
DAVID SBOYD        70
RAJNEESHSALWAN     60
FAYEZ EABBOUD      51
JOHN JSULLIVAN     49
DANILOMAYORGA      46
                   ..
DOUGLAS ABROWN      1
LAKSHMIKOSARAJU     1
YUNGLINGCHEN        1
MILANPETRENCIK      1
LILLIANHOWAN        1
Name: count, Length: 38644, dtype: int64

In [46]:
unique_owners = parcels['OWNER'].unique()
print('Number of unique owners:', len(unique_owners))   

Number of unique owners: 38644


In [52]:
clustered_parcel_data = gpd.GeoDataFrame()
single_parcel_data = gpd.GeoDataFrame()
for owner in unique_owners:
    print(f'OWNER: {owner}')

    owner_parcels = parcels[parcels['OWNER'] == owner]
    polygons = owner_parcels['geometry'].to_list()
    distance_matrix = compute_distance_matrix(polygons)

    if distance_matrix.ndim < sample_size:
        continue
    
    # get distance greater than 0 but minimum distance
    if np.all(distance_matrix == 0):
        min_valid_distance = 3
        
    else:
        min_valid_distance = np.round(np.min(distance_matrix[distance_matrix > 0]))
        print(f'Minimum valid distance: {min_valid_distance}')

    # Assuming distance_matrix is the precomputed distance matrix
    neighbors = NearestNeighbors(n_neighbors=sample_size, metric='precomputed')
    neighbors_fit = neighbors.fit(distance_matrix)
    
    distances, indices = neighbors_fit.kneighbors(distance_matrix)
    continue
    # Sort distances to the k-th nearest neighbor
    sorted_distances = np.sort(distances[:, sample_size-1])
    smooth_dist = uniform_filter1d(sorted_distances, size=10)
    difference = np.diff(smooth_dist)
    elbow_index = np.argmax(difference) + 1
    optimal_distance = np.round(smooth_dist[elbow_index])
    print(f'Optimal distance: {optimal_distance}')
    
    dbscan = DBSCAN(eps=optimal_distance, min_samples=sample_size, metric='precomputed')
    clusters = dbscan.fit_predict(distance_matrix)
    owner_parcels['cluster'] = clusters 
    owner_parcels['area'] = owner_parcels['geometry'].area
    counts = owner_parcels['cluster'].value_counts()
    
    #single_parcel_clusters = counts[counts == 1].index
    low_parcel_clusters = counts[counts < 3].index
    outliers = counts[counts.index == -1].index
    single_parcel_filter_ids = set(list(outliers) + list(low_parcel_clusters))
        
    single_parcel_filter = owner_parcels[owner_parcels['cluster'].isin(single_parcel_filter_ids)]
    single_parcel_data = pd.concat([single_parcel_data, single_parcel_filter], ignore_index=True)
    
    cluster_filter = owner_parcels[~owner_parcels['cluster'].isin(single_parcel_filter_ids)]
    if len(cluster_filter) > 0:
        cluster_filter['buff_dist'] = min_valid_distance
        clustered_parcel_data = pd.concat([clustered_parcel_data, cluster_filter], ignore_index=True)
    print('______________________________________________________________________________________')
    break        

    

OWNER: PATRICK JMONAGHAN
OWNER: PRADEEPCHIKKU
OWNER: CHARLENE AHAWKINS
OWNER: JUSTINKILBY
OWNER: LUCYANDRADE
OWNER: NORMANLEE
OWNER: JOHN DSMITH
OWNER: MAURICE EDANCE
OWNER: GLENN CWENIG
OWNER: TERESANAZARETH
OWNER: WEI LZHAO
OWNER: DUSTYGILLELAND
OWNER: PATRICK BBAILEY
OWNER: ROGER CTHOMPSON
OWNER: THOMAS PSULLIVAN
OWNER: GINASANFILIPPO
OWNER: KATHYLEE
OWNER: TIFFANYELAM
OWNER: JOHNHARRIS
OWNER: ESTELLA ETISCORNIA
OWNER: HELEN HYU
OWNER: JENNIFER ALEE
OWNER: SHARON DROGERS
OWNER: MARKATTARHA
OWNER: SHAHABLAAL
OWNER: CLINTJACKMAN
OWNER: SANDEEPKAUR
OWNER: FARHATSAFI
OWNER: JASLEENKAUR
OWNER: EVAN MJACKSON
OWNER: JASONISSA GKARKAR
OWNER: BENITO MDELGADOOLSON
OWNER: IONASPORGESKIRIAKOU
OWNER: ALFRED GBLOCH
OWNER: MY QHUYNH
OWNER: MINGJIANZHANG
OWNER: ANNEPERKINS
OWNER: PAMELA ERANDOLPH
OWNER: SHAKIR AKHAN
OWNER: BRANDENRENFRO
OWNER: WILLIAMPRASAD
OWNER: TONI KWILKERSON
OWNER: JENNIE TDANG
OWNER: JESUS OALCALA
OWNER: BECKY AYAMAGAMI
OWNER: SALVADORSANDOVAL
OWNER: CARL PVALDERAMA
OWNER: AN

KeyboardInterrupt: 

Example of Owner containing two parcels with their respective distances to eachother (meters)

In [51]:
distance_matrix.ndim

2

In [36]:
# create cluster ID
clustered_parcel_data['cluster_ID'] = clustered_parcel_data['OWNER'] + '_' + clustered_parcel_data['cluster'].astype(str)
single_parcel_data['cluster_ID'] = single_parcel_data['OWNER'] + '_' + single_parcel_data['cluster'].astype(str)

In [37]:
parcel_dissolve = clustered_parcel_data.dissolve(by='cluster_ID').reset_index()


In [38]:
parcel_dissolve['area'] = parcel_dissolve['geometry'].area

In [39]:
parcel_dissolve.groupby('cluster_ID')['area'].mean().sort_values(ascending=False)

cluster_ID
COLEMANFOLEY_0    1.592413e+07
Name: area, dtype: float64

In [40]:
mean_area = parcel_dissolve.groupby('cluster_ID')['area'].mean()
super_parcel_ids = mean_area[mean_area > 100000].index
super_parcels = parcel_dissolve[parcel_dissolve['cluster_ID'].isin(super_parcel_ids)]
super_parcels = super_parcels[['cluster_ID', 'OWNER', 'area', 'buff_dist', 'geometry']]

In [41]:
# Define a function to buffer the geometry by its buff_dist
def buffer_geometry(geometry, buff_dist):
    return geometry.buffer(buff_dist)

# Use map to apply the buffer_geometry function to each geometry and buff_dist
super_parcels['geometry'] = list(map(buffer_geometry, super_parcels['geometry'], super_parcels['buff_dist']))
super_parcels['geometry'] = list(map(buffer_geometry, super_parcels['geometry'], -super_parcels['buff_dist']))


In [29]:
super_parcels['geometry'] = super_parcels['geometry'].buffer(dbscan_distance)
super_parcels['geometry'] = super_parcels['geometry'].buffer(-dbscan_distance)


In [42]:
super_parcels.to_file(os.path.join(data_dir, 'super_parcels_rbuff_var.shp'))