In [None]:
import pandas as pd
import camelot # pdf reading
import geopandas as gpd # make the df into a geodataframe
from shapely import wkt # convert the multipolygon column (WKT) into shapely geometry for geopandas

In [None]:
demographics_pdf = "Demographics_by_NTA.pdf"
tabulation_areas_csv = "2020_Neighborhood_Tabulation_Areas_(NTAs)_20251202.csv"

tab_areas = pd.read_csv(tabulation_areas_csv, skiprows=1)

# deal with PDF first, needs more effort
demo_df = camelot.read_pdf(demographics_pdf, pages="2-end", flavor="stream")
len(demo_df)

FileNotFoundError: [Errno 2] No such file or directory: './2020_Neighborhood_Tabulation_Areas_(NTAs)_20251202.csv'

In [14]:
demo_df = demo_df[0].df
demo_df.head(15)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,,,,,,,,,,,,"% Race/Ethnicity,",,,
1,,,,,Total,,,,,,,,,,
2,,,Boro,,,65+ Population in NTA,,% 65+ Below,,,,all Population in NTA,,,
3,Neighborhood Tabulation Area (NTA) Name,NTA Code,,Boro CD,populatio,,,,,,,,,,
4,,,Name,,,,% 65+,poverty,% Hispanic/,,,% Black/ African,,,
5,,,,,n,# 65+ years,,,,,% White,,,% Asian,% Other
6,,,,,,,,,Latino,,,American,,,
7,,,,,,years,,,,,,,,,
8,Claremont-Bathgate,BX01,Bronx,203,35560,2692,7.6,42.1,,60.6,1,,37.3,0.1,1.0%
9,Eastchester-Edenwald-Baychester,BX03,Bronx,212,37887,4939,13,11.7,,24.6,3.1,,66.0,4.2,2.2%


In [15]:
demo_df.tail(15)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
25,Allerton-Pelham Gardens,BX31,Bronx,211,31993,6274,19.6,11.7,,36.3,21.0,,31.0,9.8,1.8%
26,Longwood,BX33,Bronx,202,27864,2536,9.1,42.8,,69.9,1.7,,25.7,1.5,1.1%
27,Melrose South-Mott Haven North,BX34,Bronx,201,42882,4062,9.5,40.1,,65.6,1.7,,30.4,0.6,1.7%
28,Morrisania-Melrose,BX35,Bronx,203,40925,3621,8.8,31.8,,58.3,1.2,,38.2,1.0,1.4%
29,University Heights-Morris Heights,BX36,Bronx,205,56293,4749,8.4,45.3,,68.9,1.1,,27.8,1.5,0.6%
30,Van Nest-Morris Park-Westchester Square,BX37,Bronx,211,29004,3344,11.5,14.2,,51.5,24.3,,11.8,10.0,2.5%
31,Mott Haven-Port Morris,BX39,Bronx,201,54163,4708,8.7,46.6,,68.8,1.9,,28.0,0.4,0.9%
32,Fordham South,BX40,Bronx,205,28164,1853,6.6,43.8,,71.6,0.7,,25.3,0.4,1.9%
33,Mount Hope,BX41,Bronx,205,53198,4661,8.8,36.6,,69.0,1.3,,26.4,1.4,1.8%
34,Norwood,BX43,Bronx,207,42227,3855,9.1,29.9,,63.5,9.7,,13.5,10.7,2.5%


Great. Got the data loaded. Now to clean up the column names and drop that empty column.

In [16]:
# drop first 8 rows
demo_clean = demo_df.drop(index=range(8)).reset_index(drop=True)

# drop column with index 8 and 11
demo_clean = demo_clean.drop(demo_clean.columns[[8, 11]], axis=1)

# fix column names
col_names = [
    "NTA Name", # Neighborhood Tabulation Area (NTA)
    "NTA Code",
    "Borough",
    "Borough CD",
    "Total Population",
    "65+ Population",
    "% 65+ Population",
    "% 65+ Below Poverty",
    "% Hispanic/Latino",
    "% White",
    "% Black/African American",
    "% Asian",
    "% Other"
]

demo_clean.columns = col_names

print(demo_clean.head())

                          NTA Name NTA Code Borough Borough CD  \
0               Claremont-Bathgate     BX01   Bronx        203   
1  Eastchester-Edenwald-Baychester     BX03   Bronx        212   
2       Bedford Park-Fordham North     BX05   Bronx        207   
3                          Belmont     BX06   Bronx        206   
4                        Bronxdale     BX07   Bronx        211   

  Total Population 65+ Population % 65+ Population % 65+ Below Poverty  \
0           35,560          2,692              7.6                42.1   
1           37,887          4,939               13                11.7   
2           57,685          4,993              8.7                  30   
3           29,115          2,197              7.5                28.9   
4           39,423          4,780             12.1                23.2   

  % Hispanic/Latino % White % Black/African American % Asian % Other  
0              60.6       1                     37.3     0.1    1.0%  
1              2

In [17]:
len(demo_clean)

32

In [18]:
print(tab_areas.head())

   BoroCode  BoroName  CountyFIPS NTA2020             NTAName  NTAAbbrev  \
0         3  Brooklyn          47  BK0101          Greenpoint      Grnpt   
1         3  Brooklyn          47  BK0102        Williamsburg   Wllmsbrg   
2         3  Brooklyn          47  BK0103  South Williamsburg  SWllmsbrg   
3         3  Brooklyn          47  BK0104   East Williamsburg  EWllmsbrg   
4         3  Brooklyn          47  BK0201    Brooklyn Heights      BkHts   

   NTAType CDTA2020                                           CDTAName  \
0        0     BK01     BK01 Williamsburg-Greenpoint (CD 1 Equivalent)   
1        0     BK01     BK01 Williamsburg-Greenpoint (CD 1 Equivalent)   
2        0     BK01     BK01 Williamsburg-Greenpoint (CD 1 Equivalent)   
3        0     BK01     BK01 Williamsburg-Greenpoint (CD 1 Equivalent)   
4        0     BK02  BK02 Downtown Brooklyn-Fort Greene (CD 2 Appro...   

     Shape_Length       Shape_Area  \
0  28,919.5611508  35,321,808.3909   
1  28,134.0826611  28,

In [19]:
print(tab_areas.tail())

     BoroCode       BoroName  CountyFIPS NTA2020                      NTAName  \
257         5  Staten Island          85  SI0391      Freshkills Park (South)   
258         5  Staten Island          85  SI9561               Fort Wadsworth   
259         5  Staten Island          85  SI9591  Hoffman & Swinburne Islands   
260         5  Staten Island          85  SI9592                 Miller Field   
261         5  Staten Island          85  SI9593             Great Kills Park   

      NTAAbbrev  NTAType CDTA2020  \
257  FrshklPK_S        9     SI03   
258   FtWdswrth        6     SI95   
259    HffmnIsl        9     SI95   
260     MllrFld        9     SI95   
261    GrtKlsPk        9     SI95   

                                              CDTAName    Shape_Length  \
257              SI03 South Shore (CD 3 Approximation)  33,945.4197532   
258  SI95 Great Kills Park-Fort Wadsworth (JIA 95 A...  14,814.4135523   
259  SI95 Great Kills Park-Fort Wadsworth (JIA 95 A...  4,743.127776

So I had to research what was in the "the_geom" column. Apparently, it is WKT, which stands for Well-known text, and is basically a representation of spacial objects like polygons in a text based format. From what I read, it is primarily to hold this spacial data as general (intermediary) format that can then be converted to the type you need, in our case GeoJSON (for Plotly). It is also used to store geometry in files like the column of a CSV. 

Interestingly, Multipolygon seems like more than one polygon in a row, which I would think could be done with a single polygon for each neighborhood. Unless there are neighborhoods that have something separating them into two distinct shapes? Because on Wikipedia, their example using Polygon has one that includes a polygon with another polygon inside it, cutting out a hole like a donut. Whatever the difference is, I hope it works for my choropleth.

https://en.wikipedia.org/wiki/Well-known_text_representation_of_geometry#Geometric_objects

In [24]:
print(tab_areas.columns)
print(demo_clean.columns)

Index(['BoroCode', 'BoroName', 'CountyFIPS', 'NTA2020', 'NTAName', 'NTAAbbrev',
       'NTAType', 'CDTA2020', 'CDTAName', 'Shape_Length', 'Shape_Area',
       'the_geom'],
      dtype='object')
Index(['NTA Name', 'NTA Code', 'Borough', 'Borough CD', 'Total Population',
       '65+ Population', '% 65+ Population', '% 65+ Below Poverty',
       '% Hispanic/Latino', '% White', '% Black/African American', '% Asian',
       '% Other'],
      dtype='object')


It was at this point I was curious if there was an easier way to complete this task. And there was, ArcGIS to the rescue. It had the exact file I was looking for, a GeoJSON shape file for NTA in NYC for 2020. So much easier. 

I also wanted to find data that had the NTA 2020 codes in it, since some of my previous data had older NTA codes (they change the NTA areas every 10 years during the Census to make sure the NTA areas stay in a certain range of population) or different codes entirely. And just like ArcGIS, there was a file that had exactly what I needed.

My lesson: LOOK FOR MORE DATA. It takes time and is a little annoying, but some extra time investment up front really can save a lot of difficulty later on. I also think it is fair that I looked for more data as I was working on my task. Sometimes, it is hard to know exactly what you need until you start working. 

So I won't be using that PDF I cleaned, but it was interesting learning that there is a library made for getting data from PDFs. That may come in handy some day. 

I won't be using the other file I was working on either, which is fine.

I will keep this notebook as a record, but continue my work in a fresh notebook. 