In [1]:
from sklearn.cluster import DBSCAN
import numpy as np
import geopandas as gpd 
import os
import pandas as pd 
from shapely import concave_hull
from alphashape import alphashape as alpha
# ignore warnings
import warnings
warnings.filterwarnings('ignore')


In [2]:
dbscan_distance = 50

In [3]:
data_dir = r'D:\Projects\superparcels\data\urban'
parcels = gpd.read_file(os.path.join(data_dir, 'sp_sample_08013_cluster_canidates.shp'))
utm = parcels.estimate_utm_crs().to_epsg()

In [4]:
parcels = parcels.to_crs(epsg=utm)  

In [5]:
parcels['point'] = parcels['geometry'].centroid

In [6]:
unique_owners = parcels['OWNER'].unique()
print('Number of unique owners:', len(unique_owners))   

Number of unique owners: 1516


In [7]:
parcels['OWNER'].value_counts()

OWNER
STEPHEN DTEBO          23
DAVID NLARSON          18
HENRY PVELLANDI        13
CHRISTOPHER JWALKER    13
TUCKER CBROCK           9
                       ..
JAMES ABRAY             2
JACOBBECK               2
JACLYNFLEMING           2
JACKMCCARTHY            2
ZHIWEIZHANG             2
Name: count, Length: 1516, dtype: int64

In [8]:
clustered_parcel_data = gpd.GeoDataFrame()
single_parcel_data = gpd.GeoDataFrame()
for owner in unique_owners:
    print(f'OWNER: {owner}')
    owner_parcels = parcels[parcels['OWNER'] == owner]
    owner_coords = [x.coords[0] for x in owner_parcels['point'].to_list()]
    dbscan = DBSCAN(eps=dbscan_distance, min_samples=2)
    clusters = dbscan.fit_predict(owner_coords)
    owner_parcels['cluster'] = clusters 
    counts = owner_parcels['cluster'].value_counts()
    #print(f'Cluster Counts: {counts}')
    single_parcel_clusters = counts[counts == 1].index
    single_parcel_outliers = counts[counts.index == -1].index
    single_parcel_filter_ids = list(single_parcel_clusters) + list(single_parcel_outliers)

    single_parcel_filter = owner_parcels[owner_parcels['cluster'].isin(single_parcel_filter_ids)]
    single_parcel_data = pd.concat([single_parcel_data, single_parcel_filter], ignore_index=True)
    
    cluster_filter = owner_parcels[(~owner_parcels['cluster'].isin(single_parcel_clusters))&(owner_parcels['cluster'] != -1)]
    clustered_parcel_data = pd.concat([clustered_parcel_data, cluster_filter], ignore_index=True)
    print('______________________________________________________________________________________')
    

    

OWNER: 2004 JAMES THOMASCHRISTOL
______________________________________________________________________________________
OWNER: AARON RICHARDBROWN
______________________________________________________________________________________
OWNER: AARONGREENE
______________________________________________________________________________________
OWNER: ABISHEKMANMADHAN
______________________________________________________________________________________
OWNER: ADAM BENJAMINPALMER
______________________________________________________________________________________
OWNER: ADAM GHECHT
______________________________________________________________________________________
OWNER: ADELA FANGHEL
______________________________________________________________________________________
OWNER: ADELE GMAHLE
______________________________________________________________________________________
OWNER: ADELYN MUNCE REVJONES
______________________________________________________________________________________

In [9]:
single_parcel_data

Unnamed: 0,OWNER,state_code,cnty_code,std_addr,classifica,classifi_1,geometry,point,cluster
0,2004 JAMES THOMASCHRISTOL,8,13,1200 YARMOUTH AVE UNIT 231,"Duplicate Owner, Address & Geometry",1,"POLYGON ((475950.236 4434044.970, 475950.121 4...",POINT (475986.017 4434067.382),-1
1,2004 JAMES THOMASCHRISTOL,8,13,1200 YARMOUTH AVE UNIT 231,"Duplicate Owner, Address & Geometry",1,"POLYGON ((476046.664 4434045.259, 476046.201 4...",POINT (476076.209 4434067.544),-1
2,2004 JAMES THOMASCHRISTOL,8,13,1200 YARMOUTH AVE UNIT 231,"Duplicate Owner, Address & Geometry",1,"POLYGON ((476022.937 4434246.666, 476022.812 4...",POINT (475986.578 4434177.099),-1
3,2004 JAMES THOMASCHRISTOL,8,13,1200 YARMOUTH AVE UNIT 231,"Duplicate Owner, Address & Geometry",1,"POLYGON ((476111.811 4434248.214, 476111.547 4...",POINT (476076.376 4434177.412),-1
4,AARON RICHARDBROWN,8,13,85 TIMBER LN,"Duplicate Owner & Address, Unique Geometry",0,"POLYGON ((482943.910 4428702.249, 482943.805 4...",POINT (482914.630 4428689.810),-1
...,...,...,...,...,...,...,...,...,...
3244,ZACHARY AKEIRN,8,13,4551 13TH ST # 3D,"Duplicate Owner, Address & Geometry",1,"POLYGON ((476046.664 4434045.259, 476046.201 4...",POINT (476076.209 4434067.544),-1
3245,ZACHARY AKEIRN,8,13,4551 13TH ST # 3D,"Duplicate Owner, Address & Geometry",1,"POLYGON ((476022.812 4434203.411, 476022.537 4...",POINT (475986.578 4434177.099),-1
3246,ZACHARY AKEIRN,8,13,4551 13TH ST # 3D,"Duplicate Owner, Address & Geometry",1,"POLYGON ((476111.547 4434176.167, 476111.296 4...",POINT (476076.376 4434177.412),-1
3247,ZHIWEIZHANG,8,13,1460 LEE HILL RD UNIT 1,"Duplicate Owner, Address & Geometry",1,"POLYGON ((479807.697 4427678.359, 479790.834 4...",POINT (479753.383 4427729.679),-1


In [10]:
# create cluster ID
clustered_parcel_data['cluster_ID'] = clustered_parcel_data['OWNER'] + '_' + clustered_parcel_data['cluster'].astype(str)
single_parcel_data['cluster_ID'] = single_parcel_data['OWNER'] + '_' + single_parcel_data['cluster'].astype(str)

In [11]:
clustered_parcel_data.shape

(792, 10)

In [12]:
parcel_dissolve = clustered_parcel_data.dissolve(by='cluster_ID').reset_index()

In [13]:
super_parcels = parcel_dissolve.copy()

In [14]:
def poly_to_points(poly):
    if poly.geom_type == 'Polygon':
        return poly.exterior.coords[:]
    elif poly.geom_type == 'MultiPolygon':
        coords = [p.exterior.coords[:] for p in poly.geoms]
        return [item for sublist in coords for item in sublist]
    else:
        raise ValueError('Unhandled geometry type: ' + repr(poly.geom_type))

In [15]:
super_parcels['points'] = super_parcels['geometry'].apply(poly_to_points)


In [16]:
def get_alphashape(points, alpha_val=None):
    if alpha_val is None:
        alpha_shape = alpha(points)
    else:
        alpha_shape = alpha(points, alpha_val)

    return alpha_shape

In [25]:
super_parcels['geometry'] = super_parcels['points'].apply(lambda x: get_alphashape(x, alpha_val=0.5))

In [31]:
super_parcels.geom_type.value_counts()

Polygon               173
GeometryCollection    133
MultiPolygon           11
Name: count, dtype: int64

In [32]:
alpha = super_parcels[(super_parcels['geometry'].geom_type != 'GeometryCollection') | (super_parcels['geometry'].is_empty)]

In [38]:
alpha = alpha[~alpha['geometry'].is_empty]

In [40]:
alpha.columns

Index(['cluster_ID', 'geometry', 'OWNER', 'state_code', 'cnty_code',
       'std_addr', 'classifica', 'classifi_1', 'point', 'cluster', 'points'],
      dtype='object')

In [41]:
alpha[['cluster_ID', 'OWNER', 'geometry']].to_file(os.path.join(data_dir, 'super_parcels_alpha_5.shp'))   

In [19]:

for idx, parcel in super_parcels.iterrows():
    parcel_geom = gpd.GeoDataFrame(geometry=[parcel.geometry])
    parcel_id = parcel['cluster_ID']    
    other_sp = super_parcels.loc[super_parcels['cluster_ID'] != parcel_id]
    other_union = gpd.GeoDataFrame(geometry=[other_sp.unary_union])
    parcel_clip = (gpd.overlay(parcel_geom, other_union, how='difference')
                    .explode(ignore_index=True)
                    .reset_index(drop=True))
    parcel_clip['cluster_ID'] = parcel_id
    parcel_clip['OWNER'] = parcel['OWNER']
    # drop correspnding row in super_parcels
    super_parcels = super_parcels[super_parcels['cluster_ID'] != parcel_id]
    # add parcel clip to super_parcels
    super_parcels = pd.concat([super_parcels, parcel_clip], ignore_index=True)
    


TypeError: `keep_geom_type` does not support GeometryCollection.

In [92]:
super_parcels[['cluster_ID', 'OWNER', 'geometry']].to_file(os.path.join(data_dir, f'super_parcels_cleaned_{dbscan_distance}.shp'))

In [93]:
super_parcels.head(2)

Unnamed: 0,cluster_ID,geometry,OWNER,state_code,cnty_code,std_addr,classifica,classifi_1,point,cluster
0,ALAN HWATKINS_0,"POLYGON ((477530.083 4432596.327, 477532.080 4...",ALAN HWATKINS,,,,,,,
1,ALAN HWATKINS_0,"POLYGON ((477513.422 4432592.996, 477512.690 4...",ALAN HWATKINS,,,,,,,


In [101]:
super_parcels = gpd.read_file(os.path.join(data_dir, f'super_parcels_cleaned_{dbscan_distance}.shp'))

In [102]:
super_parcels

Unnamed: 0,cluster_ID,OWNER,geometry
0,ANDREWLITTMAN_0,ANDREWLITTMAN,"POLYGON ((474947.053 4431262.111, 474947.037 4..."
1,ANTONIO JMARTINEZ_0,ANTONIO JMARTINEZ,"POLYGON ((477902.899 4432366.623, 477870.489 4..."
2,APPLERIDGEPARK_0,APPLERIDGEPARK,"POLYGON ((476159.054 4432799.466, 476142.059 4..."
3,ARTHUR CJENSEN_0,ARTHUR CJENSEN,"POLYGON ((479333.102 4431171.140, 479333.058 4..."
4,ARTHUR CJENSEN_0,ARTHUR CJENSEN,"POLYGON ((479336.040 4431177.235, 479344.972 4..."
...,...,...,...
214,WILLIAMACAR_0,WILLIAMACAR,"POLYGON ((480658.513 4429223.800, 480664.109 4..."
215,WILLIAMFARROW_0,WILLIAMFARROW,"POLYGON ((480656.664 4429161.837, 480647.153 4..."
216,WILLIAMMCINTYRE_0,WILLIAMMCINTYRE,"POLYGON ((478663.792 4431486.008, 478663.805 4..."
217,WILLIAMMCINTYRE_1,WILLIAMMCINTYRE,"POLYGON ((478722.787 4431451.454, 478722.800 4..."


In [94]:
super_parcels = super_parcels[['cluster_ID', 'OWNER', 'geometry']]

In [103]:
sp_join = gpd.sjoin(super_parcels, parcels)
sp_join

Unnamed: 0,cluster_ID,OWNER_left,geometry,index_right,OWNER_right,state_code,cnty_code,std_addr,classifica,classifi_1,point
0,ANDREWLITTMAN_0,ANDREWLITTMAN,"POLYGON ((474947.053 4431262.111, 474947.037 4...",182,ANDREWLITTMAN,8,13,3101 3RD ST,"Duplicate Owner & Address, Unique Geometry",0,POINT (474983.408 4431233.091)
0,ANDREWLITTMAN_0,ANDREWLITTMAN,"POLYGON ((474947.053 4431262.111, 474947.037 4...",183,ANDREWLITTMAN,8,13,3101 3RD ST,"Duplicate Owner & Address, Unique Geometry",0,POINT (474981.183 4431256.155)
1,ANTONIO JMARTINEZ_0,ANTONIO JMARTINEZ,"POLYGON ((477902.899 4432366.623, 477870.489 4...",259,ANTONIO JMARTINEZ,8,13,3673 HAZELWOOD CT,"Duplicate Owner & Address, Unique Geometry",0,POINT (477915.986 4432362.377)
1,ANTONIO JMARTINEZ_0,ANTONIO JMARTINEZ,"POLYGON ((477902.899 4432366.623, 477870.489 4...",3677,SUSAN HOLDENWALSH,8,13,3650 HAZELWOOD CT # G1,"Duplicate Owner & Address, Unique Geometry",0,POINT (477917.665 4432365.643)
1,ANTONIO JMARTINEZ_0,ANTONIO JMARTINEZ,"POLYGON ((477902.899 4432366.623, 477870.489 4...",260,ANTONIO JMARTINEZ,8,13,3673 HAZELWOOD CT,"Duplicate Owner & Address, Unique Geometry",0,POINT (477888.015 4432377.180)
...,...,...,...,...,...,...,...,...,...,...,...
217,WILLIAMMCINTYRE_1,WILLIAMMCINTYRE,"POLYGON ((478722.787 4431451.454, 478722.800 4...",3486,SHAUN CONRADWELLER,8,13,3265 34TH ST APT 45,"Duplicate Owner, Address & Geometry",1,POINT (478731.808 4431452.202)
217,WILLIAMMCINTYRE_1,WILLIAMMCINTYRE,"POLYGON ((478722.787 4431451.454, 478722.800 4...",2587,MARLENARICH,8,13,3265 34TH ST APT 56,"Duplicate Owner, Address & Geometry",1,POINT (478731.808 4431452.202)
217,WILLIAMMCINTYRE_1,WILLIAMMCINTYRE,"POLYGON ((478722.787 4431451.454, 478722.800 4...",3997,WILLIAMMCINTYRE,8,13,3265 34TH ST APT 58,"Duplicate Owner, Address & Geometry",1,POINT (478731.808 4431452.202)
217,WILLIAMMCINTYRE_1,WILLIAMMCINTYRE,"POLYGON ((478722.787 4431451.454, 478722.800 4...",608,CATHERINE FJOHNSON,8,13,3265 34TH ST APT 55,"Duplicate Owner, Address & Geometry",1,POINT (478731.808 4431452.202)


In [104]:
sp_join[['cluster_ID', 'OWNER_left', 'index_right', 'geometry']].to_file(os.path.join(data_dir, f'super_parcels_cleaned_{dbscan_distance}_join.shp'))

In [105]:
cluster_to_parcel_ids = sp_join.groupby('cluster_ID')['index_right'].apply(list)
cluster_to_parcel_ids

cluster_ID
ANDREWLITTMAN_0                                               [182, 183]
ANTONIO JMARTINEZ_0                                     [259, 3677, 260]
APPLERIDGEPARK_0                                    [261, 264, 262, 263]
ARTHUR CJENSEN_0       [2363, 278, 278, 2740, 2740, 675, 676, 279, 2524]
BAOWENLI_0             [1285, 2800, 1916, 2709, 3424, 1050, 2862, 366...
                                             ...                        
WILLIAMACAR_0          [1165, 3989, 3661, 1593, 2140, 3981, 468, 3897...
WILLIAMFARROW_0        [2831, 3893, 3699, 2407, 2151, 1162, 2923, 213...
WILLIAMMCINTYRE_0      [688, 3726, 652, 3536, 649, 1949, 3567, 2788, ...
WILLIAMMCINTYRE_1      [739, 3535, 3151, 1948, 936, 1037, 2550, 3725,...
YIDAZHANG_0            [2831, 3893, 3699, 2407, 2151, 1162, 2923, 213...
Name: index_right, Length: 163, dtype: object

Pick up here, remove multiple small sliver polygons, and then rerun the above code. add a condition to only grab matching ownership before dissolve the sp_geom. 

In [112]:
super_parcels.head(2)

Unnamed: 0,cluster_ID,OWNER,geometry
0,ANDREWLITTMAN_0,ANDREWLITTMAN,"POLYGON ((474947.053 4431262.111, 474947.037 4..."
1,ANTONIO JMARTINEZ_0,ANTONIO JMARTINEZ,"POLYGON ((477902.899 4432366.623, 477870.489 4..."


In [113]:
final_super_parcels = gpd.GeoDataFrame()
for cluster_id, parcel_ids in cluster_to_parcel_ids.items():
    sp_geom = super_parcels[super_parcels['cluster_ID'] == cluster_id]
    
    
    for parcel_id in set(parcel_ids):
        print(f'{cluster_id} -> {parcel_id}')
        parcel_geom = parcels[parcels.index == parcel_id]
        if parcel_geom['OWNER'].values[0] == sp_join[sp_join['index_right'] == parcel_id]['OWNER_left'].values[0]:
            print('Owner match')
            sp_geom = pd.concat([sp_geom, parcel_geom], ignore_index=True)
            print('___')
        else:
            print('Owner mismatch')
            print('___')
            continue
        
    sp_geom_dissolve = sp_geom.dissolve().reset_index()[['cluster_ID', 'OWNER', 'geometry']]
    # drop cluster_id from super parcel
    super_parcels = super_parcels[super_parcels['cluster_ID'] != cluster_id]
    # add new super parcel
    super_parcels = pd.concat([super_parcels, sp_geom_dissolve], ignore_index=True)
    
    

ANDREWLITTMAN_0 -> 182
Owner match
___
ANDREWLITTMAN_0 -> 183
Owner match
___
ANTONIO JMARTINEZ_0 -> 259
Owner match
___
ANTONIO JMARTINEZ_0 -> 260
Owner match
___
ANTONIO JMARTINEZ_0 -> 3677
Owner mismatch
___
APPLERIDGEPARK_0 -> 264
Owner match
___
APPLERIDGEPARK_0 -> 261
Owner match
___
APPLERIDGEPARK_0 -> 262
Owner match
___
APPLERIDGEPARK_0 -> 263
Owner match
___
ARTHUR CJENSEN_0 -> 675
Owner mismatch
___
ARTHUR CJENSEN_0 -> 676
Owner mismatch
___
ARTHUR CJENSEN_0 -> 2740
Owner mismatch
___
ARTHUR CJENSEN_0 -> 278
Owner match
___
ARTHUR CJENSEN_0 -> 279
Owner match
___
ARTHUR CJENSEN_0 -> 2363
Owner mismatch
___
ARTHUR CJENSEN_0 -> 2524
Owner mismatch
___
BAOWENLI_0 -> 1285
Owner mismatch
___
BAOWENLI_0 -> 902
Owner mismatch
___
BAOWENLI_0 -> 3977
Owner mismatch
___
BAOWENLI_0 -> 1162
Owner mismatch
___
BAOWENLI_0 -> 2831
Owner mismatch
___
BAOWENLI_0 -> 3985
Owner mismatch
___
BAOWENLI_0 -> 2709
Owner mismatch
___
BAOWENLI_0 -> 534
Owner mismatch
___
BAOWENLI_0 -> 1050
Owner mism

In [114]:
super_parcels.to_file(os.path.join(data_dir, f'super_parcels_cleaned_{dbscan_distance}_final.shp'))

In [115]:
single_parcel_clusters

Index([], dtype='int64', name='cluster')