In [33]:
# Import required packages
import pandas as pd
import json


In [34]:
# Read the census-tract GEOIDs from the Manhattan GeoJSON file.
# NOTE(review): this is an absolute, machine-specific path; the notebook will not
# run on another machine until it is made relative or configurable.
GEOJSON_PATH = r'C:\Users\lukej\OneDrive\Desktop\nyc-busyness\etl\census tract geofiles\manhattan_census_tracts.geojson'

# GeoJSON is UTF-8 by specification; be explicit so the platform's locale
# encoding (e.g. cp1252 on Windows) is never used to decode it.
with open(GEOJSON_PATH, 'r', encoding='utf-8') as f:
    geojson = json.load(f)

# One GEOID per feature, kept in the order the features appear in the file.
real_geoids = [feature['properties']['GEOID'] for feature in geojson['features']]
print(f"Found {len(real_geoids)} census tracts")

Found 310 census tracts


In [35]:
# Process historical foot traffic data:
# long format (one row per trip_date / LocationID / daytime_category)
# -> wide format (one row per LocationID, one column per "<daytime_category>_<year>").
hist = pd.read_csv('foot_scores_years/all_foot_traffic_scores_with_daytime_category.csv')
hist['year'] = pd.to_datetime(hist['trip_date']).dt.year

print("Years in historical data before filtering:", sorted(hist['year'].unique()))
# Keep years 2019-2030 inclusive; rows whose trip_date falls outside that window
# are dropped. `.copy()` makes the filtered frame independent of the original so
# the column assignment below does not raise SettingWithCopyWarning.
hist = hist[hist['year'].between(2019, 2030)].copy()
print("Years in historical data after filtering:", sorted(hist['year'].unique()))

hist['col'] = hist['daytime_category'] + '_' + hist['year'].astype(str)
# pivot_table aggregates with the mean by default, so each cell is the average
# daily score for that LocationID / daytime category / year.
hist_wide = hist.pivot_table(index='LocationID', columns='col', values='daily_foot_traffic_score').reset_index()

print(f"Historical data shape: {hist_wide.shape}")
print("Historical columns:", [col for col in hist_wide.columns if col != 'LocationID'][:5])

Years in historical data before filtering: [2001, 2002, 2003, 2004, 2008, 2009, 2011, 2019, 2020, 2021, 2022, 2023, 2098]
Years in historical data after filtering: [2019, 2020, 2021, 2022, 2023]
Historical data shape: (67, 16)
Historical columns: ['afternoon_2019', 'afternoon_2020', 'afternoon_2021', 'afternoon_2022', 'afternoon_2023']


In [36]:
# Load the future foot traffic table and keep only rows with year <= 2030.
# NOTE(review): this cell only produces the filtered `future` frame; it does not
# build a wide-format table from it.
future = pd.read_csv('future_foot_traffic.csv')
print("Years in future data before filtering:", sorted(future['year'].unique()))

within_horizon = future['year'].le(2030)
future = future.loc[within_horizon]
print("Years in future data after filtering:", sorted(future['year'].unique()))

Years in future data before filtering: [2025, 2026, 2027]
Years in future data after filtering: [2025, 2026, 2027]


In [37]:
# Outer-join the historical and future wide tables on LocationID, so a location
# present in either table is kept; scores missing after the join are set to 0.
# NOTE(review): `future_wide` is not assigned in any visible cell (only `future`
# is), so this may raise NameError on a fresh kernel — confirm where it is built.
foot_traffic = pd.merge(hist_wide, future_wide, on='LocationID', how='outer').fillna(0)

unique_locations = list(foot_traffic['LocationID'].unique())
unique_locations.sort()

print(f"Combined data shape: {foot_traffic.shape}")
print(f"Unique LocationIDs: {unique_locations}")
print("All columns:", foot_traffic.columns.tolist())

Combined data shape: (67, 25)
Unique LocationIDs: [4, 12, 13, 24, 41, 42, 43, 45, 48, 50, 68, 74, 75, 79, 87, 88, 90, 100, 103, 107, 113, 114, 116, 120, 125, 127, 128, 137, 140, 141, 142, 143, 144, 148, 151, 152, 153, 158, 161, 162, 163, 164, 166, 170, 186, 194, 202, 209, 211, 224, 229, 230, 231, 232, 233, 234, 236, 237, 238, 239, 243, 244, 246, 249, 261, 262, 263]
All columns: ['LocationID', 'afternoon_2019', 'afternoon_2020', 'afternoon_2021', 'afternoon_2022', 'afternoon_2023', 'evening_2019', 'evening_2020', 'evening_2021', 'evening_2022', 'evening_2023', 'morning_2019', 'morning_2020', 'morning_2021', 'morning_2022', 'morning_2023', 'afternoon_pred_2025', 'afternoon_pred_2026', 'afternoon_pred_2027', 'evening_pred_2025', 'evening_pred_2026', 'evening_pred_2027', 'morning_pred_2025', 'morning_pred_2026', 'morning_pred_2027']


In [38]:
# Map LocationIDs to GEOIDs
# NOTE(review): this is a purely positional assignment. GEOIDs are handed out in
# GeoJSON feature order to LocationIDs in ascending order, in near-equal chunks.
# Nothing in this cell uses geometry, so a tract is not guaranteed to lie inside
# the zone whose scores it receives — confirm this is intended, or replace it
# with a real tract-to-zone lookup.
result_rows = []
locations = sorted(foot_traffic['LocationID'].unique())
if not locations:
    # Fail with a clear message instead of a ZeroDivisionError below.
    raise ValueError("foot_traffic contains no LocationIDs; cannot map GEOIDs")

# Base chunk size, plus how many of the first locations receive one extra tract
# so that every GEOID is used exactly once.
tracts_per_location, extra_tracts = divmod(len(real_geoids), len(locations))

print(f"Mapping {len(locations)} LocationIDs to {len(real_geoids)} GEOIDs")
print(f"Approximately {tracts_per_location} census tracts per LocationID")

score_columns = [col for col in foot_traffic.columns if col != 'LocationID']
# First row per LocationID, indexed for direct lookup instead of re-filtering
# the whole frame for every location.
first_row_by_location = (
    foot_traffic
    .drop_duplicates(subset='LocationID')
    .set_index('LocationID', drop=False)
)

geoid_index = 0
for i, location_id in enumerate(locations):
    location_data = first_row_by_location.loc[location_id]
    location_scores = {col: location_data[col] for col in score_columns}
    num_geoids = tracts_per_location + (1 if i < extra_tracts else 0)

    # Every tract in this chunk gets a copy of the location's scores.
    chunk_end = geoid_index + num_geoids
    for geoid in real_geoids[geoid_index:chunk_end]:
        result_rows.append({'GEOID': geoid, **location_scores})
    geoid_index = chunk_end

print(f"Created {len(result_rows)} rows for GEOID mapping")

Mapping 67 LocationIDs to 310 GEOIDs
Approximately 4 census tracts per LocationID
Created 310 rows for GEOID mapping


In [39]:
# Build the final table from the per-GEOID row dicts (saving happens in a later cell)
result_df = pd.DataFrame(data=result_rows)

In [40]:
# Put GEOID first, followed by every other column in alphabetical order
score_cols = [name for name in result_df.columns if name != 'GEOID']
score_cols.sort()
cols = ['GEOID', *score_cols]
result_df = result_df.loc[:, cols]

In [43]:
# Round to 2 decimals, order rows by GEOID, and renumber the index from 0
result_df = (
    result_df
    .round(decimals=2)
    .sort_values(by='GEOID')
    .reset_index(drop=True)
)

In [44]:
# Write the final table to CSV in the working directory, without the index column
result_df.to_csv(path_or_buf='brickwyze_foot_traffic_real_geoid.csv', index=False)