# Filtering down file

In [118]:
import pandas as pd
import geopandas as gpd
from shapely.wkt import loads as wkt_loads
from pathlib import Path

# -------------------------------
# 1. Load Census Tracts
# -------------------------------
tracts_path = Path("../../../census tract geofiles/manhattan_census_tracts.geojson")
tracts = gpd.read_file(tracts_path).to_crs(epsg=4326)
tracts["tract_id"] = tracts["GEOID"].astype(str)

# -------------------------------
# 2. Load Demographic CSV and Inspect Columns
# -------------------------------
nta_path = Path("nta_demographics.csv")
nta_df = pd.read_csv(nta_path)

# Print column names to find NTA and geometry fields
print("📋 Available columns in nta_demographics.csv:")
print(nta_df.columns.tolist())


📋 Available columns in nta_demographics.csv:
['GeoType', 'NTAType', 'GeogName', 'GeoID', 'Borough', 'Pop_1E', 'Pop_1M', 'Pop_1C', 'Pop_1P', 'Pop_1Z', 'MaleE', 'MaleM', 'MaleC', 'MaleP', 'MaleZ', 'FemE', 'FemM', 'FemC', 'FemP', 'FemZ', 'PopU5E', 'PopU5M', 'PopU5C', 'PopU5P', 'PopU5Z', 'Pop5t9E', 'Pop5t9M', 'Pop5t9C', 'Pop5t9P', 'Pop5t9Z', 'Pop10t14E', 'Pop10t14M', 'Pop10t14C', 'Pop10t14P', 'Pop10t14Z', 'Pop15t19E', 'Pop15t19M', 'Pop15t19C', 'Pop15t19P', 'Pop15t19Z', 'Pop20t24E', 'Pop20t24M', 'Pop20t24C', 'Pop20t24P', 'Pop20t24Z', 'Pop25t29E', 'Pop25t29M', 'Pop25t29C', 'Pop25t29P', 'Pop25t29Z', 'Pop30t34E', 'Pop30t34M', 'Pop30t34C', 'Pop30t34P', 'Pop30t34Z', 'Pop35t39E', 'Pop35t39M', 'Pop35t39C', 'Pop35t39P', 'Pop35t39Z', 'Pop40t44E', 'Pop40t44M', 'Pop40t44C', 'Pop40t44P', 'Pop40t44Z', 'Pop45t49E', 'Pop45t49M', 'Pop45t49C', 'Pop45t49P', 'Pop45t49Z', 'Pop50t54E', 'Pop50t54M', 'Pop50t54C', 'Pop50t54P', 'Pop50t54Z', 'Pop55t59E', 'Pop55t59M', 'Pop55t59C', 'Pop55t59P', 'Pop55t59Z', 'Pop60t64E

In [119]:
import pandas as pd

# Load demographics CSV
df = pd.read_csv("nta_demographics.csv")

# Filter to Manhattan
df = df[df["Borough"].str.lower() == "manhattan"].copy()

# Columns to retain
columns_to_keep = [
    "GeoID", "Pop_1E", "MaleP", "FemP",
    "PopU5P", "Pop5t9P", "Pop10t14P", "Pop15t19P", "Pop20t24P",
    "Pop25t29P", "Pop30t34P", "Pop35t39P", "Pop40t44P", "Pop45t49P",
    "Pop50t54P", "Pop55t59P", "Pop60t64P", "Pop65t69P", "Pop70t74P",
    "Pop75t79P", "Pop80t84P", "Pop85plP", "PopU181P", "Pop65pl1P", "MdAgeE"
]
df_filtered = df[columns_to_keep].copy()

# Rename to human-readable names
rename_map = {
    "Pop_1E": "Total population",
    "MaleP": "Male (%)",
    "FemP": "Female (%)",
    "PopU5P": "Under 5 years (%)",
    "Pop5t9P": "5 to 9 years (%)",
    "Pop10t14P": "10 to 14 years (%)",
    "Pop15t19P": "15 to 19 years (%)",
    "Pop20t24P": "20 to 24 years (%)",
    "Pop25t29P": "25 to 29 years (%)",
    "Pop30t34P": "30 to 34 years (%)",
    "Pop35t39P": "35 to 39 years (%)",
    "Pop40t44P": "40 to 44 years (%)",
    "Pop45t49P": "45 to 49 years (%)",
    "Pop50t54P": "50 to 54 years (%)",
    "Pop55t59P": "55 to 59 years (%)",
    "Pop60t64P": "60 to 64 years (%)",
    "Pop65t69P": "65 to 69 years (%)",
    "Pop70t74P": "70 to 74 years (%)",
    "Pop75t79P": "75 to 79 years (%)",
    "Pop80t84P": "80 to 84 years (%)",
    "Pop85plP": "85 years and over (%)",
    "PopU181P": "Under 18 years (%)",
    "Pop65pl1P": "65 years and over (%)",
    "MdAgeE": "Median age (years)"
}
df_filtered.rename(columns=rename_map, inplace=True)

# Preview
print(df_filtered.head())

     GeoID Total population  Male (%)  Female (%)  Under 5 years (%)  \
90  MN0101           48,693      47.4        52.6                5.0   
91  MN0102           23,777      50.8        49.2                6.0   
92  MN0201           21,940      47.3        52.7                2.3   
93  MN0202           31,968      48.5        51.5                3.0   
94  MN0203           31,798      49.6        50.4                2.6   

    5 to 9 years (%)  10 to 14 years (%)  15 to 19 years (%)  \
90               3.2                 3.9                 4.3   
91               5.4                 4.3                 3.9   
92               1.3                 4.1                 3.4   
93               2.3                 1.3                10.0   
94               1.4                 2.5                 1.9   

    20 to 24 years (%)  25 to 29 years (%)  ...  55 to 59 years (%)  \
90                 8.3                15.0  ...                 4.1   
91                 8.0                 8

# Attaching shape file to this form 2020 NTA shape file in Census trat geofiles

In [120]:
import pandas as pd
import geopandas as gpd
from shapely.wkt import loads as wkt_loads

# 1. Load NTA shape file with geometry only
nta_shapes = pd.read_csv("../../../census tract geofiles/2020_Neighborhood_Tabulation_Areas__NTAs__20250605.csv")
nta_shapes["geometry"] = nta_shapes["the_geom"].apply(wkt_loads)
nta_gdf = gpd.GeoDataFrame(nta_shapes[["NTA2020", "BoroName", "geometry"]], geometry="geometry", crs="EPSG:4326")

# 2. Filter to Manhattan only
nta_gdf = nta_gdf[nta_gdf["BoroName"].str.lower() == "manhattan"].copy()

# 3. Merge with df_filtered on NTA2020 == GeoID
merged_gdf = nta_gdf.merge(df_filtered, left_on="NTA2020", right_on="GeoID", how="left")

# 4. Drop unused columns
merged_gdf.drop(columns=["GeoID", "BoroName"], inplace=True)

# 5. Check result
print("✅ Final merged GeoDataFrame shape:", merged_gdf.shape)
print(merged_gdf[["NTA2020", "Total population", "geometry"]].head())

# Final GeoDataFrame: NTA2020, geometry, and renamed demographic columns
merged_gdf.head(5)


✅ Final merged GeoDataFrame shape: (38, 26)
  NTA2020 Total population                                           geometry
0  MN0101           48,693  MULTIPOLYGON (((-74.00078 40.69429, -74.00096 ...
1  MN0102           23,777  MULTIPOLYGON (((-73.99931 40.71755, -73.99945 ...
2  MN0191                0  MULTIPOLYGON (((-74.01093 40.68449, -74.01193 ...
3  MN0201           21,940  MULTIPOLYGON (((-74.00282 40.72836, -74.00272 ...
4  MN0202           31,968  MULTIPOLYGON (((-73.9899 40.73443, -73.98987 4...


Unnamed: 0,NTA2020,geometry,Total population,Male (%),Female (%),Under 5 years (%),5 to 9 years (%),10 to 14 years (%),15 to 19 years (%),20 to 24 years (%),...,55 to 59 years (%),60 to 64 years (%),65 to 69 years (%),70 to 74 years (%),75 to 79 years (%),80 to 84 years (%),85 years and over (%),Under 18 years (%),65 years and over (%),Median age (years)
0,MN0101,"MULTIPOLYGON (((-74.00078 40.69429, -74.00096 ...",48693,47.4,52.6,5.0,3.2,3.9,4.3,8.3,...,4.1,4.2,3.4,1.9,0.6,1.1,2.0,13.8,9.0,34.3
1,MN0102,"MULTIPOLYGON (((-73.99931 40.71755, -73.99945 ...",23777,50.8,49.2,6.0,5.4,4.3,3.9,8.0,...,5.3,3.1,4.5,2.3,2.4,2.1,1.4,17.8,12.8,37.7
2,MN0191,"MULTIPOLYGON (((-74.01093 40.68449, -74.01193 ...",0,,,,,,,,...,,,,,,,,,,
3,MN0201,"MULTIPOLYGON (((-74.00282 40.72836, -74.00272 ...",21940,47.3,52.7,2.3,1.3,4.1,3.4,9.4,...,4.8,4.1,6.5,5.0,3.8,1.2,2.0,9.3,18.5,39.4
4,MN0202,"MULTIPOLYGON (((-73.9899 40.73443, -73.98987 4...",31968,48.5,51.5,3.0,2.3,1.3,10.0,10.0,...,4.4,3.6,5.3,4.2,3.6,2.7,2.1,7.7,17.8,34.8


In [121]:
from shapely.geometry import Polygon
import geopandas as gpd
import pandas as pd

# 1. Load census tract GeoJSON
tracts = gpd.read_file("../../../census tract geofiles/manhattan_census_tracts.geojson").to_crs(epsg=4326)
tracts["tract_id"] = tracts["GEOID"].astype(str)

# 2. Ensure CRS matches
merged_gdf = merged_gdf.to_crs(tracts.crs)

# 3. Intersect geometries to compute overlap
intersections = gpd.overlay(tracts, merged_gdf, how="intersection")

# 4. Calculate intersection area
intersections["intersect_area"] = intersections.geometry.area

# 5. Keep the NTA with the largest overlap for each tract
idx = intersections.groupby("tract_id")["intersect_area"].idxmax()
tract_demographics = intersections.loc[idx].copy()

# 6. Merge original tract geometry back
tract_demographics = tract_demographics.merge(
    tracts[["tract_id", "geometry"]], on="tract_id", how="left"
)

# 7. Drop unnecessary columns but keep NTA2020
cols_to_drop = [
    "intersect_area", "geometry", 
    "NTA2020_2", "CDTA2020", "CDTANAME", 
    "CTLabel", "NTAName", "BoroName", "BoroCT2020", 
    "geometry_x", "geometry_y"
]
tract_demographics.drop(columns=[col for col in cols_to_drop if col in tract_demographics.columns], inplace=True)

# 8. Add Marble Hill (36061030900) with NaNs but correct geometry
missing_id = "36061030900"
if missing_id not in tract_demographics["tract_id"].values:
    marble_geom = tracts.loc[tracts["tract_id"] == missing_id, "geometry"].values[0]
    columns = tract_demographics.columns.tolist()
    nan_row = {col: pd.NA for col in columns if col not in ["tract_id", "geometry", "NTA2020"]}
    nan_row["tract_id"] = missing_id
    nan_row["geometry"] = marble_geom
    nan_row["NTA2020"] = pd.NA
    marble_hill = gpd.GeoDataFrame([nan_row], geometry="geometry", crs=tracts.crs)
    tract_demographics = pd.concat([tract_demographics, marble_hill], ignore_index=True)

# ✅ Final GeoDataFrame output
print("✅ Final tract-level demographic GeoDataFrame:", tract_demographics.shape)
tract_demographics.dtypes
tract_demographics.head(10)


✅ Final tract-level demographic GeoDataFrame: (310, 30)



  intersections["intersect_area"] = intersections.geometry.area
  tract_demographics = pd.concat([tract_demographics, marble_hill], ignore_index=True)


Unnamed: 0,GEOID,NTA2020_1,CT2020,tract_id,Total population,Male (%),Female (%),Under 5 years (%),5 to 9 years (%),10 to 14 years (%),...,65 to 69 years (%),70 to 74 years (%),75 to 79 years (%),80 to 84 years (%),85 years and over (%),Under 18 years (%),65 years and over (%),Median age (years),geometry,NTA2020
0,36061000100,MN0191,100,36061000100,0,,,,,,...,,,,,,,,,,
1,36061000201,MN0301,201,36061000201,40100,50.1,49.9,2.4,3.7,6.3,...,7.2,7.1,3.9,3.9,5.0,14.8,27.0,47.9,,
2,36061000202,MN0302,202,36061000202,45800,48.8,51.2,3.1,4.0,3.2,...,5.2,5.8,3.4,3.3,3.1,12.5,20.9,42.1,,
3,36061000500,MN0191,500,36061000500,0,,,,,,...,,,,,,,,,,
4,36061000600,MN0301,600,36061000600,40100,50.1,49.9,2.4,3.7,6.3,...,7.2,7.1,3.9,3.9,5.0,14.8,27.0,47.9,,
5,36061000700,MN0101,700,36061000700,48693,47.4,52.6,5.0,3.2,3.9,...,3.4,1.9,0.6,1.1,2.0,13.8,9.0,34.3,,
6,36061000800,MN0301,800,36061000800,40100,50.1,49.9,2.4,3.7,6.3,...,7.2,7.1,3.9,3.9,5.0,14.8,27.0,47.9,,
7,36061000900,MN0101,900,36061000900,48693,47.4,52.6,5.0,3.2,3.9,...,3.4,1.9,0.6,1.1,2.0,13.8,9.0,34.3,,
8,36061001001,MN0302,1001,36061001001,45800,48.8,51.2,3.1,4.0,3.2,...,5.2,5.8,3.4,3.3,3.1,12.5,20.9,42.1,,
9,36061001002,MN0302,1002,36061001002,45800,48.8,51.2,3.1,4.0,3.2,...,5.2,5.8,3.4,3.3,3.1,12.5,20.9,42.1,,


In [122]:
# Check if the tract exists in the original shapefile
tract_exists = '36061030900' in tracts["tract_id"].astype(str).values
print("🗺️ Tract exists in original census tracts:", tract_exists)

# Check if it made it into tract_demographics
in_final_output = '36061030900' in tract_demographics["GEOID"].astype(str).values
print("📊 Tract exists in final demographics:", in_final_output)


🗺️ Tract exists in original census tracts: True
📊 Tract exists in final demographics: False


In [123]:
from shapely.geometry import Polygon
import geopandas as gpd
import pandas as pd

# 1. Load census tract GeoJSON
tracts = gpd.read_file("../../../census tract geofiles/manhattan_census_tracts.geojson").to_crs(epsg=4326)
tracts["tract_id"] = tracts["GEOID"].astype(str)

# 2. Ensure CRS matches
merged_gdf = merged_gdf.to_crs(tracts.crs)

# 3. Intersect geometries
intersections = gpd.overlay(tracts, merged_gdf, how="intersection")
intersections["intersect_area"] = intersections.geometry.area

# 4. Keep max-overlap per tract
idx = intersections.groupby("tract_id")["intersect_area"].idxmax()
tract_demographics = intersections.loc[idx].copy()

# 5. Merge tract geometry back in
tract_demographics = tract_demographics.merge(
    tracts[["tract_id", "geometry"]], on="tract_id", how="left"
)

# 6. Drop unneeded fields but keep NTA2020
cols_to_drop = [
    "intersect_area", "geometry", 
    "NTA2020_2", "CDTA2020", "CDTANAME", 
    "CTLabel", "NTAName", "BoroName", "BoroCT2020", 
    "geometry_x", "geometry_y"
]
tract_demographics.drop(columns=[col for col in cols_to_drop if col in tract_demographics.columns], inplace=True)

# 7. Manually add Marble Hill tract (36061030900)
missing_id = "36061030900"
if missing_id not in tract_demographics["tract_id"].values:
    marble_geom = tracts.loc[tracts["tract_id"] == missing_id, "geometry"].values[0]

    # Build dict from existing columns with NaN
    marble_row = {col: pd.NA for col in tract_demographics.columns}
    marble_row["tract_id"] = missing_id
    marble_row["geometry"] = marble_geom

    # Optionally fill in GEOID, CT2020, NTA2020_1 if expected
    if "GEOID" in marble_row: marble_row["GEOID"] = missing_id
    if "CT2020" in marble_row: marble_row["CT2020"] = 30900
    if "NTA2020_1" in marble_row: marble_row["NTA2020_1"] = pd.NA  # or appropriate code if known

    # Fill in demographic values
    marble_row.update({
        "Total population": 8594,
        "Male (%)": 43.8,
        "Female (%)": 56.2,
        "Under 5 years (%)": 5.6,
        "5 to 9 years (%)": 5.6,
        "10 to 14 years (%)": 6.7,
        "15 to 19 years (%)": 6.2,
        "20 to 24 years (%)": 6.8,
        "25 to 29 years (%)": 8.1,
        "30 to 34 years (%)": 8.0,
        "35 to 39 years (%)": 6.9,
        "40 to 44 years (%)": 6.1,
        "45 to 49 years (%)": 6.1,
        "50 to 54 years (%)": 6.6,
        "55 to 59 years (%)": 6.4,
        "60 to 64 years (%)": 5.9,
        "65 to 69 years (%)": 5.2,
        "70 to 74 years (%)": 4.1,
        "75 to 79 years (%)": 2.9,
        "80 to 84 years (%)": 1.7,
        "85 years and over (%)": 1.3,
        "Under 18 years (%)": 21.6,
        "65 years and over (%)": 15.2,
        "Median age (years)": 37.0,
    })

    # Create DataFrame and append
    marble_df = gpd.GeoDataFrame([marble_row], geometry="geometry", crs=tracts.crs)
    tract_demographics = pd.concat([tract_demographics, marble_df], ignore_index=True)

    tract_demographics_no_geom = tract_demographics.drop(columns=["geometry"])
    tract_demographics_no_geom.to_csv("nta_demographics_by_tract.csv", index=False)



  intersections["intersect_area"] = intersections.geometry.area
