In [3]:
import pandas as pd
import geopandas as gpd
import topojson as tp
import re
import warnings

# Load the sales records. GEOID is forced to a string dtype so it can be
# used as a text join key against the tract table.
sales_df = pd.read_csv('Data/CherokeeSales.csv', dtype={'GEOID': 'str'})

# Load the census-tract layer and keep only the join key, the sub-geography
# label and the geometry. The explicit copy makes the column assignment
# below act on an independent frame rather than on a slice of the full
# layer (avoids pandas' SettingWithCopyWarning).
cherokee_ct = gpd.read_file('Data/cherokee_CTs.gpkg')
cherokee_ct = cherokee_ct[['GEOID', 'Sub_geo', 'geometry']].copy()

# Label clean-up. The keys are applied as regular expressions (regex=True):
#   - fix the "Centeral" misspelling
#   - expand the abbreviation "S." to "South"
# The dot is escaped so that it matches a literal period only; an unescaped
# 'S.' matches "S" followed by ANY character and would, for example, turn
# "South ..." into "Southuth ...".
replacement_dict = {
    'Centeral': 'Central',
    r'S\.': 'South'
}

cherokee_ct['Sub_geo'] = cherokee_ct['Sub_geo'].replace(
    replacement_dict, regex=True)

# Attach the sub-geography label of each sale's census tract.
# Only the attribute columns are merged: the tract polygons are not needed
# on the sales table, so they are left out instead of being copied onto
# every sale row and dropped again afterwards.
sales_df = pd.merge(
    sales_df,
    cherokee_ct[['GEOID', 'Sub_geo']],
    how='left',
    on='GEOID'
)

# Silence the RuntimeWarnings emitted while simplifying the geometry.
# Note: this filter is installed globally and stays active for the rest of
# the session.
warnings.filterwarnings("ignore", category=RuntimeWarning)

# Topology-aware simplification of the tract polygons.
# NOTE(review): the tolerance is presumably expressed in the coordinate
# units of the layer's CRS -- confirm against the layer before changing it.
toposimplify = 0.001
cherokee_simp = tp.Topology(cherokee_ct, toposimplify=toposimplify).to_gdf()

# export simplified geometry
cherokee_simp.to_file('Data/cherokee_CTs_simp.gpkg')

# Drop sales that have no year-built value.
# NOTE(review): the earlier comment mentioned filtering on "year" as well,
# but only yr_blt has ever been checked here -- confirm whether "year"
# should be added to the subset.
sales_df = sales_df.dropna(subset=['yr_blt'])

# export
sales_df.to_csv('Data/Cherokee_20-24.csv', index=False)

print(f'dataframe rows: {sales_df.shape[0]:,}')
print('------')
sales_df.head(3)

dataframe rows: 21,048
------


Unnamed: 0,GEOID,year,month,year-month,sale_amt,home_size,yr_blt,price_sf,Sub_geo
0,13057090101,2023,7,2023-7,319000.0,2128.0,2013.0,149.906015,East Cherokee
1,13057090101,2023,8,2023-8,190000.0,1040.0,1998.0,182.692308,East Cherokee
2,13057090101,2021,9,2021-9,70000.0,1274.0,1997.0,54.945055,East Cherokee
