In [25]:
import pandas as pd
import geopandas as gpd
from shapely.geometry import box
from shapely import wkt


demo_df = pd.read_csv("atl_data_numeric.csv").drop('Unnamed: 0', axis=1).rename(columns={'Geography': 'GEOID'})

grid_df = pd.read_csv("atlanta_grid_data.csv")

tract_df = pd.read_csv("atl_census_tracts_with_data.csv")
tract_df['geometry'] = tract_df['geometry'].apply(wkt.loads)
tracts_gdf = gpd.GeoDataFrame(tract_df, geometry='geometry', crs="EPSG:4326")

def make_grid_poly(row):
    return box(row['Min Long'], row['Min Lat'], row['Max Long'], row['Max Lat'])

grid_df['geometry'] = grid_df.apply(make_grid_poly, axis=1)
grid_gdf = gpd.GeoDataFrame(grid_df, geometry='geometry', crs="EPSG:4326")


In [26]:
demo_df

Unnamed: 0,GEOID,Geography_name,total_population,population_density,male_percent,percent_inhouseholds,percent_marriedhouseholds,percent_cohabitingcouple,percent_solomale,percent_solofemale,...,percent_novehicles,percent_1vehicle_perperson,percent_lessthan1vehicle_perperson,under25y_percent_highschool,under25y_percent_somecollege,under25y_percent_bachelors,over25y_percent_highschool,over25y_percent_somecollege,over25y_percent_bachelors,over25y_percent_graduatedegree
0,1400000US13121000100,Census Tract 1; Fulton County; Georgia,5679,4426.344505,0.494982,1.000000,0.213066,0.033809,0.051065,0.066385,...,0.000000,0.235781,0.127487,0.077,0.220,0.415,0.044,0.151,0.429,0.361
1,1400000US13121000201,Census Tract 2.01; Fulton County; Georgia,2751,5535.211268,0.533261,1.000000,0.187205,0.054526,0.100327,0.086877,...,0.000000,0.303526,0.065431,0.387,0.240,0.373,0.057,0.124,0.388,0.388
2,1400000US13121000202,Census Tract 2.02; Fulton County; Georgia,3316,5400.651466,0.520808,0.990048,0.216224,0.030157,0.053076,0.055187,...,0.002413,0.197527,0.247889,0.000,0.195,0.705,0.057,0.039,0.323,0.544
3,1400000US13121000400,Census Tract 4; Fulton County; Georgia,1865,3103.161398,0.538874,0.995710,0.204826,0.034316,0.156032,0.147989,...,0.055764,0.345845,0.149598,0.111,0.758,0.131,0.041,0.139,0.272,0.534
4,1400000US13121000501,Census Tract 5.01; Fulton County; Georgia,4281,8845.041322,0.558748,1.000000,0.098342,0.059799,0.231254,0.146695,...,0.029199,0.266293,0.117496,0.043,0.504,0.453,0.092,0.132,0.485,0.292
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
305,1400000US13121011801,Census Tract 118.01; Fulton County; Georgia,1322,5981.900452,0.475038,1.000000,0.018911,0.022693,0.220877,0.239032,...,0.033283,0.158094,0.054463,0.076,0.197,0.727,0.152,0.087,0.399,0.198
306,1400000US13121011802,Census Tract 118.02; Fulton County; Georgia,1370,2691.552063,0.500000,0.981022,0.035766,0.039416,0.127737,0.086131,...,0.112409,0.201460,0.252555,0.269,0.293,0.281,0.227,0.229,0.224,0.130
307,1400000US13121011902,Census Tract 119.02; Fulton County; Georgia,1377,10051.094890,0.463326,0.959332,0.051561,0.051561,0.159768,0.198983,...,0.212055,0.246187,0.285403,0.543,0.000,0.457,0.220,0.093,0.286,0.209
308,1400000US13121012000,Census Tract 120; Fulton County; Georgia,3408,3704.347826,0.522594,0.882629,0.036092,0.023768,0.120012,0.091549,...,0.144366,0.245012,0.223592,0.357,0.038,0.183,0.462,0.184,0.195,0.044


In [27]:
intersections = gpd.overlay(tracts_gdf, grid_gdf, how='intersection')

intersections['intersection_area'] = intersections.geometry.area
tracts_gdf['tract_area'] = tracts_gdf.geometry.area

intersections = intersections.merge(
    tracts_gdf[['GEOID', 'tract_area']],
    on='GEOID'
)
intersections['area_ratio'] = intersections['intersection_area'] / intersections['tract_area']



  intersections['intersection_area'] = intersections.geometry.area

  tracts_gdf['tract_area'] = tracts_gdf.geometry.area


In [28]:
demo_df = demo_df.drop(columns=['Geography_name']).set_index('GEOID').add_prefix(f"{"demographics"}_")

# demo_merged = (
#     merge_on_geo(income_df, 'income')
#     .join(merge_on_geo(education_df, 'edu'), how='outer')
#     .join(merge_on_geo(transport_df, 'trans'), how='outer')
#     .join(merge_on_geo(population_df, 'pop'), how='outer')
# )

intersections = intersections.set_index('GEOID').join(demo_df, how='left').reset_index()


In [29]:
for col in intersections.columns:
    if col not in ['GEOID', 'Grid ID', 'intersection_area', 'tract_area', 'area_ratio', 'geometry']:
        intersections[col] = pd.to_numeric(intersections[col], errors='coerce')

feature_cols = [
    col for col in intersections.columns
    if col not in ['GEOID', 'Grid ID', 'intersection_area', 'tract_area', 'area_ratio', 'geometry']
]

for col in feature_cols:
    intersections[col] = intersections[col] * intersections['area_ratio']

cleaned_data = intersections.groupby('Grid ID')[feature_cols].sum().reset_index()

In [30]:
cleaned_data = cleaned_data.drop(['GEO_ID', 'Unnamed: 0'], axis=1, errors='ignore')
cleaned_data = cleaned_data.drop(['Min Lat', 'Max Lat', 'Min Long', 'Max Long', 'Grid ID'], axis=1).dropna().reset_index().drop('index', axis=1)
# cleaned_data = cleaned_data[cleaned_data['Ride Count'] != 0]
cleaned_data = cleaned_data.dropna()
cleaned_data = cleaned_data[cleaned_data['demographics_total_population'] != 0]
cleaned_data.to_csv("atl_grid_demographics_numeric.csv", index=False)