In [15]:
import os
import geopandas as gpd
import pandas as pd
from shapely import wkt
import glob

In [5]:
# Input CSV that is read into `df` with pd.read_csv.
# NOTE(review): absolute, machine-specific Windows path — adjust before running elsewhere.
data_path = r'D:\Projects\superparcels\data\abreunig_pocs_spatialrecord_polygon_superparcel_urban.csv'

In [12]:
# Folder that receives the per-FIPS sample shapefiles (sp_sample_<FIPS>.shp).
# NOTE(review): absolute, machine-specific Windows path — adjust before running elsewhere.
data_dir = r'D:\Projects\superparcels\data\Urban'

In [9]:
# Read FIPS as text so the codes keep their leading zeros (e.g. '06075').
df = pd.read_csv(data_path, dtype={'FIPS': str})

In [10]:
# Distinct FIPS codes present in the CSV, in order of first appearance.
fips = df['FIPS'].unique()


In [11]:
# Display the FIPS codes found in the CSV.
fips

array(['06075', '08031', '06001', '48113'], dtype=object)

In [13]:
# Write one shapefile per FIPS code: select that code's rows, parse the WKT
# text in the 'geometry' column into shapely geometries, and save the result
# as <data_dir>/sp_sample_<FIPS>.shp with CRS EPSG:4326.
# The output folder is created first so to_file() does not fail when data_dir
# does not exist yet (e.g. on a fresh machine).
os.makedirs(data_dir, exist_ok=True)

for fip in fips:
    print(fip)
    df_fip = df[df['FIPS'] == fip]
    gdf = gpd.GeoDataFrame(df_fip, geometry=df_fip['geometry'].apply(wkt.loads), crs=4326)
    gdf.to_file(os.path.join(data_dir, f'sp_sample_{fip}.shp'))


06075
08031
06001
48113


Clean the urban parcel samples and remove duplicate geometries for the clustering proof of concept (POC)

In [16]:
# Folder searched for the sample shapefiles to classify; the classification
# exports further down are written here as well. Same location as data_dir.
# NOTE(review): absolute, machine-specific Windows path — adjust before running elsewhere.
input_dir = r'D:\Projects\superparcels\data\Urban'

In [None]:
# Load the sample shapefiles that sit one folder below input_dir
# (<input_dir>/<subfolder>/*.shp).
# The pattern is built from separate path components: the original literal
# '*\*.shp' relied on the invalid escape sequence '\*', which newer Python
# versions warn about, and only worked with Windows path separators.
# sorted() makes the iteration order deterministic, which matters because
# `fips` and `df` are overwritten on every pass: after the loop they hold the
# FIPS code and GeoDataFrame of the LAST shapefile only.
# NOTE(review): `fips` also replaces the array of FIPS codes built earlier in
# the notebook, and if no file matches, `df` keeps whatever it held before —
# confirm whether the downstream cells were meant to run inside this loop.
for fi in sorted(glob.glob(os.path.join(input_dir, '*', '*.shp'))):
    # File names look like sp_sample_<FIPS>.shp: take the text after the last
    # underscore and drop the extension.
    fips = os.path.basename(fi).split('_')[-1].split('.')[0]
    df = gpd.read_file(fi)

In [23]:
# Identify duplicate owners, addresses, and geometries.
# keep=False flags every member of a duplicate group, not only the repeats.
# NOTE(review): pandas treats missing values as equal to one another, so rows
# with a null OWNER or std_addr are flagged as duplicates of each other —
# confirm that is intended.
df['duplicate_owner'] = df.duplicated(subset=['OWNER'], keep=False)
df['duplicate_address'] = df.duplicated(subset=['std_addr'], keep=False)
df['duplicate_geometry'] = df.duplicated(subset=['geometry'], keep=False)

# Truth table for the classification, keyed by
# (duplicate_owner, duplicate_address, duplicate_geometry).
classification_labels = {
    (True, True, True): 'Class1: Duplicate Owner, Address & Geometry',
    (True, True, False): 'Class2: Duplicate Owner & Address, Unique Geometry',
    (True, False, False): 'Class3: Duplicate Owner & Unique Address, Geometry',
    (False, True, True): 'Class4: Unique Owner & Duplicate Address & Geometry',
    (False, False, True): 'Class5: Unique Owner & Address, Duplicate Geometry',
}

# Label given to every flag combination that is not listed in the table.
# NOTE(review): besides (False, False, False) this also catches
# (True, False, True) and (False, True, False), i.e. rows that do have a
# duplicate owner+geometry or a duplicate address — confirm they really
# belong in the "all unique" class.
fallback_label = 'Class6: Unique Owner & Address & Geometry'

# Create the classification column with a plain table lookup over the three
# flag columns instead of a row-wise DataFrame.apply: same labels, but no
# per-row Series construction, and it also works on an empty frame.
flag_columns = ['duplicate_owner', 'duplicate_address', 'duplicate_geometry']
df['classification'] = [
    classification_labels.get(flags, fallback_label)
    for flags in zip(*(df[col].tolist() for col in flag_columns))
]

# Count the occurrences of each classification
matrix = df['classification'].value_counts()

print(matrix)


classification
Class6: Unique Owner & Address & Geometry              14822
Class5: Unique Owner & Address, Duplicate Geometry      5787
Class1: Duplicate Owner, Address & Geometry             3211
Class3: Duplicate Owner & Unique Address, Geometry      2111
Class2: Duplicate Owner & Address, Unique Geometry       434
Class4: Unique Owner & Duplicate Address & Geometry       16
Name: count, dtype: int64


In [24]:
# Create classification codes: pull the number that follows "Class" in each
# label (e.g. "Class2: ..." -> 2) so classes can be filtered numerically.
# A regex is used instead of reading one fixed character position, so the
# extraction keeps working if a class number ever has more than one digit.
df['classification_code'] = (
    df['classification']
    .str.extract(r'^Class(\d+):', expand=False)
    .astype(int)
)
df['classification_code'].value_counts()

classification_code
6    14822
5     5787
1     3211
3     2111
2      434
4       16
Name: count, dtype: int64

In [None]:
# Preview the first three rows of df.
df.head(3)

In [None]:
# Export the parcels of a single classification code to their own shapefile.
class_code = 2
code = df[df['classification_code'] == class_code]
# Name the file after `fips`, the FIPS code the loading loop parsed from the
# shapefile that is currently in `df`, instead of a hard-coded county code
# that can drift out of sync with the loaded data.
# NOTE(review): Shapefile field names are limited to 10 characters, so the
# long column names (classification, classification_code, duplicate_*) will
# be truncated on write — confirm the resulting field names are usable.
code.to_file(os.path.join(input_dir, f'sp_sample_{fips}_code{class_code}.shp'))

In [None]:
# Classification codes kept as clustering candidates (classes after dissolving).
# The 'canidate' spelling is left as-is so the variable and output file names
# stay stable for anything that already refers to them.
cluster_canidate_codes = [2, 3]
cluster = df[df['classification_code'].isin(cluster_canidate_codes)]
# Name the file after `fips`, the FIPS code the loading loop parsed from the
# shapefile that is currently in `df`; the original f-string had no
# placeholder and hard-coded a county code instead.
cluster.to_file(os.path.join(input_dir, f'sp_sample_{fips}_cluster_canidates.shp'))