In [18]:
from sklearn.cluster import DBSCAN, HDBSCAN
import numpy as np
import geopandas as gpd 
import os
import pandas as pd 
from shapely import concave_hull, convex_hull, segmentize, minimum_rotated_rectangle
from shapely.ops import nearest_points
# ignore warnings
import warnings
warnings.filterwarnings('ignore')


In [19]:
# Tunable parameters for the super-parcel pipeline.
# Distances are in CRS units; the parcels are reprojected to an estimated UTM zone before use.
dbscan_distance = 50  # DBSCAN `eps` applied to the precomputed parcel-to-parcel distance matrix
density_threshold = 10  # passed to shapely `segmentize` as the maximum segment length when densifying
concave_ratio = 0.7  # `ratio` argument handed to shapely `concave_hull`

In [20]:
def polygon_distance(polygon1, polygon2):
    """Return the minimum distance between two geometries.

    Asks the geometry for the distance directly instead of first building the
    pair of nearest points and then measuring between them.

    Parameters
    ----------
    polygon1, polygon2 : shapely geometry
        Any objects exposing ``is_empty`` and ``distance(other)``.

    Returns
    -------
    float
        Shortest distance between the two geometries, in the units of their
        coordinates.

    Raises
    ------
    ValueError
        If either geometry is empty. The previous ``nearest_points``-based
        version raised the same exception type; failing loudly here avoids
        silently putting NaN into a distance matrix.
    """
    if polygon1.is_empty or polygon2.is_empty:
        raise ValueError('Cannot compute the distance to an empty geometry')
    return polygon1.distance(polygon2)

def compute_distance_matrix(polygons):
    """Build the symmetric matrix of pairwise distances between polygons.

    Entry ``[i, j]`` is ``polygon_distance(polygons[i], polygons[j])``; the
    diagonal stays at zero. Each unordered pair is measured once and the value
    is mirrored across the diagonal.
    """
    count = len(polygons)
    matrix = np.zeros((count, count))

    # Index pairs strictly above the diagonal, in row-major order.
    upper_rows, upper_cols = np.triu_indices(count, k=1)
    for row, col in zip(upper_rows, upper_cols):
        separation = polygon_distance(polygons[row], polygons[col])
        matrix[row, col] = separation
        matrix[col, row] = separation  # mirror to the lower triangle

    return matrix

In [21]:
# Input / output locations (machine-specific absolute paths).
data_dir = r'D:\Projects\superparcels\data\urban'
output_dir = r'D:\Projects\superparcels\data\urban\outputs\dmatrix'
# Later cells write shapefiles into output_dir; create it up front so those
# writes cannot fail after the expensive clustering has already run.
os.makedirs(output_dir, exist_ok=True)

# Candidate parcels that will be clustered per owner.
parcels = gpd.read_file(os.path.join(data_dir, 'sp_sample_08013_cluster_canidates.shp'))
# EPSG code of the UTM zone estimated from the data, used for reprojection.
utm = parcels.estimate_utm_crs().to_epsg()

In [22]:
# Reproject to the estimated UTM zone so geometric distances are in projected units rather than degrees.
parcels = parcels.to_crs(epsg=utm)  

In [23]:
# Distinct owner names; each owner's parcels are clustered independently later on.
unique_owners = parcels['OWNER'].unique()
print(f'Number of unique owners: {len(unique_owners)}')

Number of unique owners: 1396


In [24]:
# Number of parcels held by each owner.
parcels['OWNER'].value_counts()

OWNER
STEPHEN DTEBO           25
HENRY PVELLANDI         14
CHRISTOPHER JWALKER     13
DAVID NLARSON           13
SOTERIOSPALMOS           9
                        ..
MICHAEL RHOWARD          1
MARCPATTERSON            1
HEATHERDWIGHT            1
MANOUCHEHRZIRAKZADEH     1
DAVID WPAULE             1
Name: count, Length: 1396, dtype: int64

In [36]:
# Cluster each owner's parcels with DBSCAN on a precomputed polygon-to-polygon
# distance matrix, then split the rows into
#   * clustered_parcel_data - parcels in a cluster with two or more members
#   * single_parcel_data    - DBSCAN noise (label -1) and one-member clusters
#
# The per-owner slices are collected in lists and concatenated once at the end
# instead of growing a GeoDataFrame inside the loop.
clustered_frames = []
single_frames = []
example_shown = False

for owner in unique_owners:
    # .copy() so adding the 'cluster' column does not write into a slice of `parcels`.
    owner_parcels = parcels[parcels['OWNER'] == owner].copy()
    if owner_parcels.empty:
        # e.g. a missing owner value never compares equal, leaving nothing to cluster.
        continue

    polygons = owner_parcels['geometry'].to_list()
    distance_matrix = compute_distance_matrix(polygons)

    if not example_shown:
        # Print one worked example only; the loop keeps going (a debug `break`
        # here used to stop the pipeline after the first owner).
        print(f'OWNER: {owner}')
        print(distance_matrix)
        example_shown = True

    dbscan = DBSCAN(eps=dbscan_distance, min_samples=2, metric='precomputed')
    owner_parcels['cluster'] = dbscan.fit_predict(distance_matrix)

    counts = owner_parcels['cluster'].value_counts()
    single_parcel_clusters = counts[counts == 1].index
    single_parcel_outliers = counts[counts.index == -1].index
    single_parcel_filter_ids = list(single_parcel_clusters) + list(single_parcel_outliers)

    # A row is "single" when its label is noise (-1) or belongs to a one-member
    # cluster; every remaining row is part of a real multi-parcel cluster.
    is_single = owner_parcels['cluster'].isin(single_parcel_filter_ids)
    single_frames.append(owner_parcels[is_single])
    clustered_frames.append(owner_parcels[~is_single])

single_parcel_data = (
    pd.concat(single_frames, ignore_index=True) if single_frames else gpd.GeoDataFrame()
)
clustered_parcel_data = (
    pd.concat(clustered_frames, ignore_index=True) if clustered_frames else gpd.GeoDataFrame()
)
    

    

OWNER: MARIJETERELLEN
[[   0.         3593.54426338]
 [3593.54426338    0.        ]]


Example of an owner with two parcels and the pairwise distances between them (in meters).

In [26]:
# Tag every row with "<OWNER>_<cluster label>" in both the clustered and the single-parcel tables.
for frame in (clustered_parcel_data, single_parcel_data):
    frame['cluster_ID'] = frame['OWNER'] + '_' + frame['cluster'].astype(str)

In [27]:
# Merge the parcels of each cluster into one geometry, keeping cluster_ID as a regular column.
parcel_dissolve = clustered_parcel_data.dissolve(by='cluster_ID', as_index=False)

In [28]:
# Work on a copy so the geometry edits below leave parcel_dissolve untouched.
super_parcels = parcel_dissolve.copy()

In [29]:
# Densify the super parcels: add vertices so that no segment is longer than density_threshold.
super_parcels['geometry'] = super_parcels['geometry'].apply(segmentize, max_segment_length=density_threshold)

In [30]:
# Replace each cluster geometry with its concave hull (tightness set by concave_ratio).
super_parcels['geometry'] = super_parcels['geometry'].apply(concave_hull, ratio=concave_ratio)

In [31]:

# Clip every super-parcel hull so it no longer overlaps another super parcel or
# any single (un-clustered) parcel.
#
# Order matters: `super_parcels` is rebound on every pass, so a hull is clipped
# against the *current* state of the others (already clipped for hulls handled
# earlier, still un-clipped for those that come later). `iterrows()` keeps
# walking the frame as it was when the loop started, so rows appended inside
# the loop are not visited again.
frame_crs = super_parcels.crs

# The union of all single parcels only changes when a single parcel shares the
# hull's cluster_ID, so build it once and recompute only in that case.
single_cluster_ids = set(single_parcel_data['cluster_ID'])
all_single_union = single_parcel_data.unary_union

for idx, parcel in super_parcels.iterrows():
    parcel_id = parcel['cluster_ID']
    # Carry the CRS on the temporary frames so overlay and concat see matching CRS.
    parcel_geom = gpd.GeoDataFrame(geometry=[parcel.geometry], crs=frame_crs)

    other_sp = super_parcels.loc[super_parcels['cluster_ID'] != parcel_id]
    if parcel_id in single_cluster_ids:
        other_single = single_parcel_data.loc[single_parcel_data['cluster_ID'] != parcel_id]
        other_single_union = other_single.unary_union
    else:
        other_single_union = all_single_union

    # Two-row frame: everything this hull must not overlap.
    other_union = gpd.GeoDataFrame(
        geometry=[other_sp.unary_union, other_single_union], crs=frame_crs
    )

    # Keep only the part of the hull not covered by anything else; a hull cut
    # into several pieces becomes one row per piece.
    parcel_clip = gpd.overlay(parcel_geom, other_union, how='difference').explode(ignore_index=True)
    parcel_clip['cluster_ID'] = parcel_id
    parcel_clip['OWNER'] = parcel['OWNER']

    # Swap the un-clipped row for its clipped piece(s).
    super_parcels = super_parcels[super_parcels['cluster_ID'] != parcel_id]
    super_parcels = pd.concat([super_parcels, parcel_clip], ignore_index=True)
    
    


In [34]:
# One row per polygon part, then number the parts within each cluster: "<cluster_ID>_<n>".
super_parcels = super_parcels.explode(ignore_index=True)
part_number = super_parcels.groupby('cluster_ID').cumcount().astype(str)
super_parcels['sp_ID'] = super_parcels['cluster_ID'] + "_" + part_number

In [35]:
# Write the super parcels to a shapefile whose name records the parameters used.
sp_export_name = f'sp_dbscan{dbscan_distance}-cr{concave_ratio}-dens{density_threshold}.shp'
sp_export_columns = ['sp_ID', 'cluster_ID', 'OWNER', 'geometry']
super_parcels[sp_export_columns].to_file(os.path.join(output_dir, sp_export_name))

In [33]:
# Peek at the first rows of the un-clustered (single) parcels.
single_parcel_data.head(3)

Unnamed: 0,state_code,cnty_code,OWNER,std_addr,duplicate_,duplicat_1,duplicat_2,classifica,classifi_1,geometry,cluster,cluster_ID
0,8,13,MARIJETERELLEN,501 EVERGREEN AVE,1,0,0,"Class3: Duplicate Owner & Unique Address, Geom...",3,"POLYGON ((475278.750 4431268.298, 475278.750 4...",-1,MARIJETERELLEN_-1
1,8,13,MARIJETERELLEN,1235 BASELINE RD,1,0,0,"Class3: Duplicate Owner & Unique Address, Geom...",3,"POLYGON ((476412.821 4427818.727, 476412.769 4...",-1,MARIJETERELLEN_-1
2,8,13,JASONCOTRELL,640 LARAMIE BLVD,1,0,0,"Class3: Duplicate Owner & Unique Address, Geom...",3,"POLYGON ((475550.330 4435190.856, 475557.271 4...",-1,JASONCOTRELL_-1


In [None]:
# Write the un-clustered parcels next to the super-parcel export.
single_parcels_path = os.path.join(output_dir, 'single_parcels.shp')
single_parcel_data.to_file(single_parcels_path)

In [101]:
# Load the cleaned super-parcel layer from data_dir.
# NOTE(review): no cell in this notebook writes this file - presumably it is
# produced by a manual clean-up of the export above; confirm.
cleaned_sp_path = os.path.join(data_dir, f'super_parcels_cleaned_{dbscan_distance}.shp')
super_parcels = gpd.read_file(cleaned_sp_path)

In [None]:
# Display the current super_parcels frame.
super_parcels

In [94]:
# Keep only the identifying columns and the geometry.
sp_keep_columns = ['cluster_ID', 'OWNER', 'geometry']
super_parcels = super_parcels[sp_keep_columns]

In [None]:
# Pair every super parcel with each original parcel it intersects
# (inner join; the matching parcel's index lands in `index_right`).
sp_join = gpd.sjoin(super_parcels, parcels, how='inner', predicate='intersects')
sp_join

In [104]:
# Save the super-parcel / parcel pairing for inspection.
join_columns = ['cluster_ID', 'OWNER_left', 'index_right', 'geometry']
join_path = os.path.join(data_dir, f'super_parcels_cleaned_{dbscan_distance}_join.shp')
sp_join[join_columns].to_file(join_path)

In [None]:
# For each cluster_ID, the list of parcel index values it was joined to.
parcel_index_by_cluster = sp_join.groupby('cluster_ID')['index_right']
cluster_to_parcel_ids = parcel_index_by_cluster.apply(list)
cluster_to_parcel_ids

TODO: Pick up here. Remove the multiple small sliver polygons, then rerun the code above. Add a condition to only grab parcels with matching ownership before dissolving `sp_geom`.

In [None]:
# Peek at the first two super parcels.
super_parcels.head(2)

In [None]:
# Grow each super parcel by dissolving into it the original parcels that it
# intersects AND that belong to the same owner.
#
# The owner test compares each parcel with the owner of the cluster being
# processed. Looking the owner up through the first sp_join row for the parcel
# is not reliable: a parcel that touches several super parcels has several
# join rows, and the first one may belong to a different cluster.
final_super_parcels = gpd.GeoDataFrame()  # placeholder kept from the original cell; nothing below fills it
for cluster_id, parcel_ids in cluster_to_parcel_ids.items():
    sp_geom = super_parcels[super_parcels['cluster_ID'] == cluster_id]
    if sp_geom.empty:
        # Nothing to extend for this id (e.g. super_parcels was re-filtered after the join).
        continue

    # Owner of the super parcel currently being extended.
    cluster_owner = sp_geom['OWNER'].iloc[0]

    # Parcels joined to this cluster (duplicates collapse), restricted to the same owner.
    joined_parcels = parcels[parcels.index.isin(parcel_ids)]
    owned_parcels = joined_parcels[joined_parcels['OWNER'] == cluster_owner]

    # The super-parcel rows come first so dissolve() takes cluster_ID / OWNER from them.
    sp_geom = pd.concat([sp_geom, owned_parcels], ignore_index=True)
    sp_geom_dissolve = sp_geom.dissolve().reset_index()[['cluster_ID', 'OWNER', 'geometry']]

    # Replace the old rows of this cluster with the merged geometry.
    super_parcels = super_parcels[super_parcels['cluster_ID'] != cluster_id]
    super_parcels = pd.concat([super_parcels, sp_geom_dissolve], ignore_index=True)
    
    

In [114]:
# Write the owner-merged super parcels back to data_dir.
final_sp_path = os.path.join(data_dir, f'super_parcels_cleaned_{dbscan_distance}_final.shp')
super_parcels.to_file(final_sp_path)

In [None]:
# NOTE(review): `single_parcel_clusters` is only assigned inside the per-owner
# clustering loop, so this shows whatever the last executed iteration left
# behind - looks like leftover debugging; confirm it is still needed.
single_parcel_clusters