In [1]:
import pandas as pd
import plotly.express as px
import os
os.environ['USE_PYGEOS'] = '0'
import geopandas as gpd

In [2]:
# Polygon layer that the 2023 sale points are spatially joined against further down
# (presumably census tracts, going by the "CTs" in the file name -- confirm).
forsyth_path = 'Geography/Forsyth_CTs.gpkg'
forsyth = gpd.read_file(forsyth_path)

# Show the layer's CRS so it can be checked against the CRS used when the sale points are built.
forsyth.crs

<Geographic 2D CRS: EPSG:4269>
Name: NAD83
Axis Info [ellipsoidal]:
- Lat[north]: Geodetic latitude (degree)
- Lon[east]: Geodetic longitude (degree)
Area of Use:
- name: North America - onshore and offshore: Canada - Alberta; British Columbia; Manitoba; New Brunswick; Newfoundland and Labrador; Northwest Territories; Nova Scotia; Nunavut; Ontario; Prince Edward Island; Quebec; Saskatchewan; Yukon. Puerto Rico. United States (USA) - Alabama; Alaska; Arizona; Arkansas; California; Colorado; Connecticut; Delaware; Florida; Georgia; Hawaii; Idaho; Illinois; Indiana; Iowa; Kansas; Kentucky; Louisiana; Maine; Maryland; Massachusetts; Michigan; Minnesota; Mississippi; Missouri; Montana; Nebraska; Nevada; New Hampshire; New Jersey; New Mexico; New York; North Carolina; North Dakota; Ohio; Oklahoma; Oregon; Pennsylvania; Rhode Island; South Carolina; South Dakota; Tennessee; Texas; Utah; Vermont; Virginia; Washington; West Virginia; Wisconsin; Wyoming. US Virgin Islands. British Virgin Islands

In [6]:
# Raw 2023 sales export. 'full_address' is the key used to match rows to the geocoder output.
new_data = pd.read_csv('NewSales/Forsyth_2023.csv')
new_data['full_address'] = new_data['Address'] + ' Forsyth County, GA'

# Geocoder output; it must carry a 'full_address' column to merge on.
new_geocoded = pd.read_csv('NewSales/Forsyth_2023_geocoded.csv')

# Inner join on the shared key: only addresses present in both files survive.
# NOTE(review): if an address repeats on both sides the merge multiplies rows -- confirm the key is unique
# in the geocoded file.
new_full = pd.merge(new_geocoded, new_data, on='full_address')

# create geodataframe: one point per merged row from its 'long'/'lat' columns
sale_points = gpd.points_from_xy(new_full['long'], new_full['lat'])
new_full_gdf = gpd.GeoDataFrame(new_full, geometry=sale_points, crs="EPSG:4269")

# create the 'year' column: calendar year parsed from 'Sale Date'
new_full_gdf['year_sale'] = pd.DatetimeIndex(new_full_gdf['Sale Date']).year

# clean up columns: the raw headers carry stray spaces; rebinding instead of inplace=True
# keeps this step safe to re-run
new_full_gdf = new_full_gdf.rename(columns={
    'Year  Built ': 'year_blt',
    'Square Ft ': 'Square Ft',
})

# create numeric sale column
# The pattern is a raw string because "\$" inside a normal string literal is an invalid escape
# sequence (a warning on current Python versions). The character class strips both "$" and ","
# in a single pass, so no second replace for commas is needed.
new_full_gdf['price_number'] = (
    new_full_gdf['Sale Price']
    .str.replace(r'[$,]', '', regex=True)
    .astype(float)
)

# create price/sf column
# NOTE(review): assumes 'Square Ft' was read as a numeric column -- confirm for the 2023 export.
new_full_gdf['price_sf'] = new_full_gdf['price_number'] / new_full_gdf['Square Ft']

# spatial join: match each sale point to the `forsyth` polygon it lies within and bring that
# polygon's attribute columns onto the sale row
forsyth_joined = new_full_gdf.sjoin(forsyth, predicate="within")

# only select those sales greater than $1,000, where the size is greater than 75 SF, and includes qualified sales only
is_valid_sale = (
    (forsyth_joined['price_number'] > 1000)
    & (forsyth_joined['Square Ft'] > 75)
    & (forsyth_joined['Qualified Sales'] == 'Qualified')
)

# Take an explicit copy of the filtered rows: columns are assigned on this frame afterwards, and
# assigning into a boolean-mask slice is what triggers pandas' SettingWithCopyWarning.
forsyth_joined = forsyth_joined[is_valid_sale].copy()

# final conversions
forsyth_joined['GEOID'] = forsyth_joined['GEOID'].astype(str)

# Row identifier: address, sale date and numeric price joined with '-'
id_address = forsyth_joined['Address']
id_date = forsyth_joined['Sale Date'].astype(str)
id_price = forsyth_joined['price_number'].astype(str)
forsyth_joined['unique_ID'] = id_address + '-' + id_date + '-' + id_price

# Parse the sale dates once and derive the calendar fields from the parsed values
sale_dates = pd.DatetimeIndex(forsyth_joined['Sale Date'])
forsyth_joined['year'] = sale_dates.year
forsyth_joined['month'] = sale_dates.month
# 'year-month' is a plain string concatenation, so the month is not zero-padded
forsyth_joined['year-month'] = forsyth_joined['year'].astype(str) + '-' + forsyth_joined['month'].astype(str)

# Keep only the columns listed here, in this order
output_columns = [
    'Square Ft',
    'year_sale',
    'year_blt',
    'price_sf',
    'Sale Price',
    'GEOID',
    'Sub_geo',
    'unique_ID',
    'year',
    'month',
    'year-month',
]
forsyth_joined = forsyth_joined[output_columns]
forsyth_joined.columns

Index(['Square Ft', 'year_sale', 'year_blt', 'price_sf', 'Sale Price', 'GEOID',
       'Sub_geo', 'unique_ID', 'year', 'month', 'year-month'],
      dtype='object')

In [7]:
# this is the data to which we'll want to append the new 2023 data
# Read with ',' as the thousands separator and without pandas' default NA markers, then drop the
# 'field_1' column and rename the year columns.
df = (
    pd.read_csv('Geocoded_Final_Joined.csv', thousands=',', keep_default_na=False)
    .drop(columns=['field_1'])
    .rename(columns={'Year  Built': 'year_blt', 'Year': 'year_sale'})
)

df['GEOID'] = df['GEOID'].astype(str)

# Row identifier: address, sale date and numeric price joined with '-'
existing_id_date = df['Sale Date'].astype(str)
existing_id_price = df['price_number'].astype(str)
df['unique_ID'] = df['Address'] + '-' + existing_id_date + '-' + existing_id_price

# Parse the sale dates once and derive the calendar fields from the parsed values
existing_sale_dates = pd.DatetimeIndex(df['Sale Date'])
df['year'] = existing_sale_dates.year
df['month'] = existing_sale_dates.month
# 'year-month' is a plain string concatenation, so the month is not zero-padded
df['year-month'] = df['year'].astype(str) + '-' + df['month'].astype(str)

# Keep only the columns listed here, in this order
existing_columns = [
    'Square Ft',
    'year_sale',
    'year_blt',
    'price_sf',
    'Sale Price',
    'GEOID',
    'Sub_geo',
    'unique_ID',
    'year',
    'month',
    'year-month',
]
df = df[existing_columns]

type(df)

pandas.core.frame.DataFrame

In [8]:
# Stack the existing sales (df) and the new 2023 sales (forsyth_joined) into one table.
# ignore_index=True renumbers the rows 0..n-1; without it each input keeps its own row labels,
# so labels overlap and the index column written to the CSV contains duplicates.
df_final = pd.concat([df, forsyth_joined], ignore_index=True)

# Write the combined table. The index is still written as the first (unnamed) column, so the
# file keeps the same column layout as before.
df_final.to_csv('Geocoded_Final_Joined4.csv')