# Filtering down file

In [56]:
import pandas as pd
import geopandas as gpd
from shapely.wkt import loads as wkt_loads
from pathlib import Path

# -------------------------------
# 1. Load Census Tracts
# -------------------------------
tracts_path = Path("../../../census tract geofiles/manhattan_census_tracts.geojson")
tracts = gpd.read_file(tracts_path).to_crs(epsg=4326)
tracts["tract_id"] = tracts["GEOID"].astype(str)

# -------------------------------
# 2. Load Demographic CSV and Inspect Columns
# -------------------------------
nta_path = Path("nta_econ_primary_data.csv")
nta_df = pd.read_csv(nta_path)

# Print column names to find NTA and geometry fields
print("📋 Available columns in nta_econ_primary_data.csv:")
print(nta_df.columns.tolist())


📋 Available columns in nta_econ_primary_data.csv:
['GeoType', 'NTAType', 'GeogName', 'GeoID', 'Borough', 'Pop16plE', 'Pop16plM', 'Pop16plC', 'Pop16plP', 'Pop16plZ', 'LFE', 'LFM', 'LFC', 'LFP', 'LFZ', 'CvLF1E', 'CvLF1M', 'CvLF1C', 'CvLF1P', 'CvLF1Z', 'CvEm16pl1E', 'CvEm16pl1M', 'CvEm16pl1C', 'CvEm16pl1P', 'CvEm16pl1Z', 'CvLFUEm1E', 'CvLFUEm1M', 'CvLFUEm1C', 'CvLFUEm1P', 'CvLFUEm1Z', 'LFArmdFE', 'LFArmdFM', 'LFArmdFC', 'LFArmdFP', 'LFArmdFZ', 'NLF1E', 'NLF1M', 'NLF1C', 'NLF1P', 'NLF1Z', 'CvLF2E', 'CvLF2M', 'CvLF2C', 'CvLF2P', 'CvLF2Z', 'CvLFUEm2E', 'CvLFUEm2M', 'CvLFUEm2C', 'CvLFUEm2P', 'CvLFUEm2Z', 'F16plE', 'F16plM', 'F16plC', 'F16plP', 'F16plZ', 'F16plLFE', 'F16plLFM', 'F16plLFC', 'F16plLFP', 'F16plLFZ', 'F16plCvLFE', 'F16plCvLFM', 'F16plCvLFC', 'F16plCvLFP', 'F16plCvLFZ', 'F16plCLFEE', 'F16plCLFEM', 'F16plCLFEC', 'F16plCLFEP', 'F16plCLFEZ', 'OChU6E', 'OChU6M', 'OChU6C', 'OChU6P', 'OChU6Z', 'OChU6PLFE', 'OChU6PLFM', 'OChU6PLFC', 'OChU6PLFP', 'OChU6PLFZ', 'OCh6t17E', 'OCh6t17M', 'OCh6t

In [57]:
import pandas as pd

# Load economic data
df = pd.read_csv("nta_econ_primary_data.csv")

# Filter to Manhattan
df = df[df["Borough"].str.lower() == "manhattan"].copy()

# Columns to retain
columns_to_keep = [
    "GeoID", "HHIU10E", "HHI10t14E", "HHI15t24E", "HHI25t34E",
    "HHI35t49E", "HHI50t74E", "HHI75t99E", "HI100t149E", "HI150t199E",
    "HHI200plE", "MdHHIncE", "MnHHIncE"
]
df_filtered = df[columns_to_keep].copy()

# Rename to human-readable names
rename_map = {
    "HHIU10E": "Income <$10k",
    "HHI10t14E": "$10k–14.9k",
    "HHI15t24E": "$15k–24.9k",
    "HHI25t34E": "$25k–34.9k",
    "HHI35t49E": "$35k–49.9k",
    "HHI50t74E": "$50k–74.9k",
    "HHI75t99E": "$75k–99.9k",
    "HI100t149E": "$100k–149k",
    "HI150t199E": "$150k–199k",
    "HHI200plE": "$200k+",
    "MdHHIncE": "Median HH income ($)",
    "MnHHIncE": "Mean HH income ($)"
}
df_filtered.rename(columns=rename_map, inplace=True)

# Preview
print(df_filtered.head())


     GeoID Income <$10k $10k–14.9k $15k–24.9k $25k–34.9k $35k–49.9k  \
90  MN0101          544        392        358        417        908   
91  MN0102          250        181        227        154        161   
92  MN0201          730        398        620        520        508   
93  MN0202          640        233        247        486        740   
94  MN0203          841        411        814        571        889   

   $50k–74.9k $75k–99.9k $100k–149k $150k–199k  $200k+ Median HH income ($)  \
90      1,156      1,875      3,511      2,848  11,457              195,153   
91        810        563        796        935   5,592              200,000   
92      1,084        760      1,965      1,450   3,808              133,847   
93      1,594        872      2,154      1,656   6,995              175,436   
94      1,361      2,008      2,612      1,822   7,454              147,570   

   Mean HH income ($)  
90            268,791  
91            485,548  
92            258,036  
93

# Attaching shape file to this form 2020 NTA shape file in Census trat geofiles

In [58]:
import pandas as pd
import geopandas as gpd
from shapely.wkt import loads as wkt_loads

# 1. Load NTA shape file with geometry only
nta_shapes = pd.read_csv("../../../census tract geofiles/2020_Neighborhood_Tabulation_Areas__NTAs__20250605.csv")
nta_shapes["geometry"] = nta_shapes["the_geom"].apply(wkt_loads)
nta_gdf = gpd.GeoDataFrame(nta_shapes[["NTA2020", "BoroName", "geometry"]], geometry="geometry", crs="EPSG:4326")

# 2. Filter to Manhattan only
nta_gdf = nta_gdf[nta_gdf["BoroName"].str.lower() == "manhattan"].copy()

# 3. Merge with df_filtered on NTA2020 == GeoID
merged_gdf = nta_gdf.merge(df_filtered, left_on="NTA2020", right_on="GeoID", how="left")

# 4. Drop unused columns
merged_gdf.drop(columns=["GeoID", "BoroName"], inplace=True)

# 5. Check result
print("✅ Final merged GeoDataFrame shape:", merged_gdf.shape)
print(merged_gdf[["NTA2020", "Median HH income ($)", "geometry"]].head())

# Final GeoDataFrame: NTA2020, geometry, and renamed economic columns
merged_gdf.head(5)


✅ Final merged GeoDataFrame shape: (38, 14)
  NTA2020 Median HH income ($)  \
0  MN0101              195,153   
1  MN0102              200,000   
2  MN0191                  NaN   
3  MN0201              133,847   
4  MN0202              175,436   

                                            geometry  
0  MULTIPOLYGON (((-74.00078 40.69429, -74.00096 ...  
1  MULTIPOLYGON (((-73.99931 40.71755, -73.99945 ...  
2  MULTIPOLYGON (((-74.01093 40.68449, -74.01193 ...  
3  MULTIPOLYGON (((-74.00282 40.72836, -74.00272 ...  
4  MULTIPOLYGON (((-73.9899 40.73443, -73.98987 4...  


Unnamed: 0,NTA2020,geometry,Income <$10k,$10k–14.9k,$15k–24.9k,$25k–34.9k,$35k–49.9k,$50k–74.9k,$75k–99.9k,$100k–149k,$150k–199k,$200k+,Median HH income ($),Mean HH income ($)
0,MN0101,"MULTIPOLYGON (((-74.00078 40.69429, -74.00096 ...",544,392,358,417,908,1156,1875,3511,2848,11457,195153.0,268791.0
1,MN0102,"MULTIPOLYGON (((-73.99931 40.71755, -73.99945 ...",250,181,227,154,161,810,563,796,935,5592,200000.0,485548.0
2,MN0191,"MULTIPOLYGON (((-74.01093 40.68449, -74.01193 ...",0,0,0,0,0,0,0,0,0,0,,
3,MN0201,"MULTIPOLYGON (((-74.00282 40.72836, -74.00272 ...",730,398,620,520,508,1084,760,1965,1450,3808,133847.0,258036.0
4,MN0202,"MULTIPOLYGON (((-73.9899 40.73443, -73.98987 4...",640,233,247,486,740,1594,872,2154,1656,6995,175436.0,288647.0


In [59]:
from shapely.geometry import Polygon
import geopandas as gpd
import pandas as pd

# 1. Load census tract GeoJSON
tracts = gpd.read_file("../../../census tract geofiles/manhattan_census_tracts.geojson").to_crs(epsg=4326)
tracts["tract_id"] = tracts["GEOID"].astype(str)

# 2. Ensure CRS matches
merged_gdf = merged_gdf.to_crs(tracts.crs)

# 3. Intersect geometries to compute overlap
intersections = gpd.overlay(tracts, merged_gdf, how="intersection")

# 4. Calculate intersection area
intersections["intersect_area"] = intersections.geometry.area

# 5. Keep the NTA with the largest overlap for each tract
idx = intersections.groupby("tract_id")["intersect_area"].idxmax()
tract_economics = intersections.loc[idx].copy()

# 6. Merge original tract geometry back
tract_economics = tract_economics.merge(
    tracts[["tract_id", "geometry"]], on="tract_id", how="left"
)

# 7. Drop unnecessary columns but keep NTA2020
cols_to_drop = [
    "intersect_area", "geometry", 
    "NTA2020_2", "CDTA2020", "CDTANAME", 
    "CTLabel", "NTAName", "BoroName", "BoroCT2020", 
    "geometry_x", "geometry_y"
]
tract_economics.drop(columns=[col for col in cols_to_drop if col in tract_economics.columns], inplace=True)

# 8. Add Marble Hill (36061030900) with NaNs but correct geometry
missing_id = "36061030900"
if missing_id not in tract_economics["tract_id"].values:
    marble_geom = tracts.loc[tracts["tract_id"] == missing_id, "geometry"].values[0]
    columns = tract_economics.columns.tolist()
    nan_row = {col: pd.NA for col in columns if col not in ["tract_id", "geometry", "NTA2020"]}
    nan_row["tract_id"] = missing_id
    nan_row["geometry"] = marble_geom
    nan_row["NTA2020"] = pd.NA
    marble_hill = gpd.GeoDataFrame([nan_row], geometry="geometry", crs=tracts.crs)
    tract_economics = pd.concat([tract_economics, marble_hill], ignore_index=True)

# ✅ Final GeoDataFrame output
print("✅ Final tract-level economic GeoDataFrame:", tract_economics.shape)
print(tract_economics.dtypes)
tract_economics.head(10)


✅ Final tract-level economic GeoDataFrame: (310, 18)
GEOID                     object
NTA2020_1                 object
CT2020                    object
tract_id                  object
Income <$10k              object
$10k–14.9k                object
$15k–24.9k                object
$25k–34.9k                object
$35k–49.9k                object
$50k–74.9k                object
$75k–99.9k                object
$100k–149k                object
$150k–199k                object
$200k+                    object
Median HH income ($)      object
Mean HH income ($)        object
geometry                geometry
NTA2020                   object
dtype: object



  intersections["intersect_area"] = intersections.geometry.area


Unnamed: 0,GEOID,NTA2020_1,CT2020,tract_id,Income <$10k,$10k–14.9k,$15k–24.9k,$25k–34.9k,$35k–49.9k,$50k–74.9k,$75k–99.9k,$100k–149k,$150k–199k,$200k+,Median HH income ($),Mean HH income ($),geometry,NTA2020
0,36061000100,MN0191,100,36061000100,0,0,0,0,0,0,0,0,0,0,,,,
1,36061000201,MN0301,201,36061000201,2508,2791,2611,1960,1681,1776,1404,1680,843,1071,29559.0,68651.0,,
2,36061000202,MN0302,202,36061000202,1999,2071,2731,1562,2084,2168,2186,2980,1217,2700,54122.0,96129.0,,
3,36061000500,MN0191,500,36061000500,0,0,0,0,0,0,0,0,0,0,,,,
4,36061000600,MN0301,600,36061000600,2508,2791,2611,1960,1681,1776,1404,1680,843,1071,29559.0,68651.0,,
5,36061000700,MN0101,700,36061000700,544,392,358,417,908,1156,1875,3511,2848,11457,195153.0,268791.0,,
6,36061000800,MN0301,800,36061000800,2508,2791,2611,1960,1681,1776,1404,1680,843,1071,29559.0,68651.0,,
7,36061000900,MN0101,900,36061000900,544,392,358,417,908,1156,1875,3511,2848,11457,195153.0,268791.0,,
8,36061001001,MN0302,1001,36061001001,1999,2071,2731,1562,2084,2168,2186,2980,1217,2700,54122.0,96129.0,,
9,36061001002,MN0302,1002,36061001002,1999,2071,2731,1562,2084,2168,2186,2980,1217,2700,54122.0,96129.0,,


In [60]:
# Check if the tract exists in the original shapefile
tract_exists = '36061030900' in tracts["tract_id"].astype(str).values
print("🗺️ Tract exists in original census tracts:", tract_exists)

# Check if it made it into tract_economics
in_final_output = '36061030900' in tract_economics["tract_id"].astype(str).values
print("📊 Tract exists in final economic output:", in_final_output)


🗺️ Tract exists in original census tracts: True
📊 Tract exists in final economic output: True


In [61]:
import pandas as pd
import geopandas as gpd
from shapely.wkt import loads as wkt_loads

# Load census tracts
tracts = gpd.read_file("../../../census tract geofiles/manhattan_census_tracts.geojson").to_crs(epsg=4326)
tracts["tract_id"] = tracts["GEOID"].astype(str)

# Load and filter economic data
econ_df = pd.read_csv("nta_econ_primary_data.csv")
econ_df = econ_df[econ_df["Borough"].str.lower() == "manhattan"].copy()

# Filter relevant columns
econ_filtered = econ_df[[
    "GeoID", "HHIU10E", "HHI10t14E", "HHI15t24E", "HHI25t34E", "HHI35t49E",
    "HHI50t74E", "HHI75t99E", "HI100t149E", "HI150t199E", "HHI200plE",
    "MdHHIncE", "MnHHIncE"
]]

# Load NTA shapes and merge
nta_shapes = pd.read_csv("../../../census tract geofiles/2020_Neighborhood_Tabulation_Areas__NTAs__20250605.csv")
nta_shapes["geometry"] = nta_shapes["the_geom"].apply(wkt_loads)
nta_gdf = gpd.GeoDataFrame(nta_shapes[["NTA2020", "BoroName", "geometry"]], geometry="geometry", crs="EPSG:4326")
nta_gdf = nta_gdf[nta_gdf["BoroName"].str.lower() == "manhattan"]

# Merge with economic data
merged_gdf = nta_gdf.merge(econ_filtered, left_on="NTA2020", right_on="GeoID", how="left")
merged_gdf.drop(columns=["GeoID", "BoroName"], inplace=True)

# Intersect with census tracts
merged_gdf = merged_gdf.to_crs(tracts.crs)
intersections = gpd.overlay(tracts[["tract_id", "geometry"]], merged_gdf, how="intersection")

# Preserve area and NTA code
intersections["intersect_area"] = intersections.geometry.area

# Take largest overlapping NTA per tract
idx = intersections.groupby("tract_id")["intersect_area"].idxmax()
tract_econ = intersections.loc[idx].copy()

# ✅ Preserve NTA2020 before dropping anything
if "NTA2020" not in tract_econ.columns:
    tract_econ = tract_econ.merge(
        merged_gdf[["NTA2020", "geometry"]],
        on="geometry",
        how="left"
    )


# Merge back original geometry
tract_econ = tract_econ.merge(tracts[["tract_id", "geometry"]], on="tract_id", how="left")

# Clean up fields
cols_to_drop = [
    "intersect_area", "geometry", 
    "CDTA2020", "CDTANAME", "CTLabel", 
    "NTAName", "BoroName", "BoroCT2020", 
    "geometry_x", "geometry_y"
]
tract_econ.drop(columns=[col for col in cols_to_drop if col in tract_econ.columns], inplace=True)

# Rename NTA2020 → NTA2020
tract_econ.rename(columns={"NTA2020": "NTA2020", "tract_id": "tract_id"}, inplace=True)

# Extract CT2020 from GEOID
tract_econ["CT2020"] = tract_econ["tract_id"].str[-5:].astype(int)
tract_econ["GEOID"] = tract_econ["tract_id"]

# Reorder columns to match demographic file
column_order = [
    "tract_id", "NTA2020", "CT2020", "GEOID",
    "HHIU10E", "HHI10t14E", "HHI15t24E", "HHI25t34E", "HHI35t49E",
    "HHI50t74E", "HHI75t99E", "HI100t149E", "HI150t199E", "HHI200plE",
    "MdHHIncE", "MnHHIncE"
]
tract_econ = tract_econ.reindex(columns=column_order)

# Append manually filled row for Marble Hill
missing_id = "36061030900"
if missing_id not in tract_econ["tract_id"].values:
    marble_row = pd.Series({
        "tract_id": missing_id,
        "NTA2020": pd.NA,
        "CT2020": 30900,
        "GEOID": missing_id,
        "HHIU10E": 946,
        "HHI10t14E": 818,
        "HHI15t24E": 921,
        "HHI25t34E": 892,
        "HHI35t49E": 1024,
        "HHI50t74E": 1867,
        "HHI75t99E": 1006,
        "HI100t149E": 1272,
        "HI150t199E": 484,
        "HHI200plE": 362,
        "MdHHIncE": 52386,
        "MnHHIncE": 68849
    })
    tract_econ = pd.concat([tract_econ, pd.DataFrame([marble_row])], ignore_index=True)

# Export to CSV
tract_econ.to_csv("nta_economics_by_tract.csv", index=False)
print("✅ Exported with proper formatting and filled Marble Hill row.")


✅ Exported with proper formatting and filled Marble Hill row.



  intersections["intersect_area"] = intersections.geometry.area
