In [1]:
#Import the required packages
#Import package pandas for data analysis
import pandas as pd

# Import package numpy for numeric computing
import numpy as np

# Import package matplotlib for visualisation/plotting
import matplotlib.pyplot as plt

#For showing plots directly in the notebook run the command below
%matplotlib inline

# For saving multiple plots into a single pdf file
from matplotlib.backends.backend_pdf import PdfPages 

import geopandas as gpd

from shapely.geometry import Point

from haversine import haversine

In [2]:
# Display settings: None removes pandas' row/column truncation limits
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# Read the MTA subway entrances/exits export into a DataFrame
df = pd.read_csv(
    'datasets/1.Transit_Access_Mobility_Score/Subway_Access_Score/MTA_Subway_Entrances_and_Exits__2024_20250524.csv',
    keep_default_na=True,
    delimiter=',',
    skipinitialspace=True,
)

# Preview the first rows
df.head()

Unnamed: 0,Division,Line,Borough,Stop Name,Complex ID,Constituent Station Name,Station ID,GTFS Stop ID,Daytime Routes,Entrance Type,Entry Allowed,Exit Allowed,Entrance Latitude,Entrance Longitude,entrance_georeference
0,BMT,4th Av,B,Atlantic Av-Barclays Ctr,617,Atlantic Av-Barclays Ctr,27,R31,2 3 4 5 B D N Q R,Stair,YES,YES,40.683905,-73.978879,POINT (-73.978879 40.683905)
1,BMT,4th Av,B,Atlantic Av-Barclays Ctr,617,Atlantic Av-Barclays Ctr,27,R31,2 3 4 5 B D N Q R,Elevator,YES,YES,40.683805,-73.978487,POINT (-73.978487 40.683805)
2,BMT,4th Av,B,Atlantic Av-Barclays Ctr,617,Atlantic Av-Barclays Ctr,27,R31,2 3 4 5 B D N Q R,Stair,YES,YES,40.683928,-73.978412,POINT (-73.978412 40.683928)
3,BMT,4th Av,B,Union St,28,Union St,28,R32,R,Stair,YES,YES,40.677154,-73.98343,POINT (-73.9834296 40.6771544)
4,BMT,4th Av,B,Union St,28,Union St,28,R32,R,Stair,YES,YES,40.677296,-73.983336,POINT (-73.9833364 40.6772958)


For the Mobility Score we need the subway access. Therefore, the following features are relevant:
- Stop Name
- Entry Allowed
- Entrance Type (for accessibility)
- Daytime Routes
- Entrance Latitude
- Entrance Longitude

In [3]:
# Filter for Manhattan first: list the borough codes present in the data

df['Borough'].unique()


array(['B', 'Q', 'M', 'Bx', 'SI'], dtype=object)

In [4]:
# Keep only the entrances whose borough code is 'M'
df = df.loc[df['Borough'].eq('M')]

In [5]:
# (rows, columns) remaining after the borough filter
df.shape

(868, 15)

In [6]:
# Borough holds a single value after the 'M' filter, so it carries no information.
# errors='ignore' keeps this cell safe to re-run once the column is already gone
# (a plain drop raises KeyError on the second run).
df = df.drop(columns='Borough', errors='ignore')

In [7]:
# Restrict the table to the columns used for the subway access score
df = df.loc[:, [
    'Stop Name',
    'Entry Allowed',
    'Entrance Type',
    'Daytime Routes',
    'Exit Allowed',
    'Entrance Latitude',
    'Entrance Longitude',
]]

In [8]:
# Inspect the column dtypes (nothing is converted here) to see whether any need changing
df.dtypes

Stop Name              object
Entry Allowed          object
Entrance Type          object
Daytime Routes         object
Exit Allowed           object
Entrance Latitude     float64
Entrance Longitude    float64
dtype: object

In [9]:
# Preview the reduced entrance table
df.head()

Unnamed: 0,Stop Name,Entry Allowed,Entrance Type,Daytime Routes,Exit Allowed,Entrance Latitude,Entrance Longitude
52,Roosevelt Island,YES,Station House,F,YES,40.759019,-73.953458
53,Lexington Av/63 St,YES,Easement - Street,F Q,YES,40.764968,-73.966679
54,Lexington Av/63 St,YES,Stair,F Q,YES,40.764738,-73.966553
55,Lexington Av/63 St,YES,Easement - Street,F Q,YES,40.764896,-73.966426
56,Lexington Av/63 St,YES,Stair,F Q,YES,40.764101,-73.965041


In [10]:
# Check for duplicates: show rows that repeat an earlier row in every column
# (an empty result means there are no exact duplicates)
df[df.duplicated()]

Unnamed: 0,Stop Name,Entry Allowed,Entrance Type,Daytime Routes,Exit Allowed,Entrance Latitude,Entrance Longitude


In [11]:
# Number of missing values per column
df.isnull().sum()

Stop Name             0
Entry Allowed         0
Entrance Type         0
Daytime Routes        0
Exit Allowed          0
Entrance Latitude     0
Entrance Longitude    0
dtype: int64

In [12]:
# (rows, columns) of the cleaned entrance table
df.shape

(868, 7)

In [9]:
# Load the full MapPLUTO shapefile (tax-lot polygons; all boroughs, filtered to Manhattan later)
pluto = gpd.read_file('datasets/1.Transit_Access_Mobility_Score/Subway_Access_Score/nyc_mappluto_25v1_1_shp/MapPLUTO.shp')

In [10]:
# (rows, columns) of the unfiltered MapPLUTO layer
pluto.shape

(856734, 95)

In [11]:
# Filter for Manhattan only.
# Done first so the geometry validity check below runs on the Manhattan
# parcels instead of the whole city; both filters commute, so the
# resulting rows are the same.
pluto = pluto[pluto["Borough"] == "MN"]

# Filter only valid geometries
pluto = pluto[pluto.is_valid]

In [12]:
# Number of distinct values per column in the filtered parcels
pluto.nunique()

Borough           1
Block          1960
Lot             551
CD               14
BCT2020         309
BCTCB2020      3231
CT2010          287
CB2010          142
SchoolDist        7
Council          10
ZipCode          73
FireComp         76
PolicePrct       25
HealthCent        8
HealthArea       88
Sanitboro         2
SanitDistr       12
SanitSub         27
Address       41495
ZoneDist1       105
ZoneDist2        90
ZoneDist3        38
ZoneDist4         3
Overlay1          7
Overlay2          3
SPDist1          30
SPDist2           7
SPDist3           0
LtdHeight         2
SplitZone         2
BldgClass       196
LandUse          11
Easements         5
OwnerType         5
OwnerName     33215
LotArea       10344
BldgArea      20188
ComArea       10098
ResArea       15674
OfficeArea     4214
RetailArea     5541
GarageArea     1096
StrgeArea       936
FactryArea      111
OtherArea      2890
AreaSource        5
NumBldgs         29
NumFloors       117
UnitsRes        571
UnitsTotal      587


In [13]:
# Remove the columns that are irrelevant for this analysis
pluto = pluto.drop(columns=[
    'Borough', 'Sanitboro', 'SanitDistr', 'SanitSub',
    'SPDist1', 'SPDist2', 'SPDist3', 'LtdHeight', 'SplitZone',
    'IrrLotCode', 'BoroCode', 'ZMCode', 'Shape_Leng', 'Notes',
    'DCPEdited', 'Version', 'PFIRM15_FL', 'FIRM07_FLA',
])

In [14]:
# List the columns that remain after the drop
print(pluto.columns)

Index(['Block', 'Lot', 'CD', 'BCT2020', 'BCTCB2020', 'CT2010', 'CB2010',
       'SchoolDist', 'Council', 'ZipCode', 'FireComp', 'PolicePrct',
       'HealthCent', 'HealthArea', 'Address', 'ZoneDist1', 'ZoneDist2',
       'ZoneDist3', 'ZoneDist4', 'Overlay1', 'Overlay2', 'BldgClass',
       'LandUse', 'Easements', 'OwnerType', 'OwnerName', 'LotArea', 'BldgArea',
       'ComArea', 'ResArea', 'OfficeArea', 'RetailArea', 'GarageArea',
       'StrgeArea', 'FactryArea', 'OtherArea', 'AreaSource', 'NumBldgs',
       'NumFloors', 'UnitsRes', 'UnitsTotal', 'LotFront', 'LotDepth',
       'BldgFront', 'BldgDepth', 'Ext', 'ProxCode', 'LotType', 'BsmtCode',
       'AssessLand', 'AssessTot', 'ExemptTot', 'YearBuilt', 'YearAlter1',
       'YearAlter2', 'HistDist', 'Landmark', 'BuiltFAR', 'ResidFAR', 'CommFAR',
       'FacilFAR', 'BBL', 'CondoNo', 'Tract2010', 'XCoord', 'YCoord',
       'ZoneMap', 'Sanborn', 'TaxMap', 'EDesigNum', 'APPBBL', 'APPDate',
       'PLUTOMapID', 'Latitude', 'Longitude', 'Sha

In [15]:

# Dissolve parcels into one geometry per Census Tract.
# NOTE: dissolve's default aggregation keeps the first parcel's value for
# every non-geometry column, so those attributes describe one lot, not the tract.
tracts = pluto.dissolve(by="BCT2020")

# Compute centroids in the layer's own (projected) CRS
centroid_points = tracts.geometry.centroid
tracts["centroid"] = centroid_points

# Extract lat/lon in degrees. The centroids are in projected coordinates,
# so they are reprojected to WGS84 (EPSG:4326) before reading x/y;
# reading them directly would store projected units under these names.
centroids_wgs84 = centroid_points.to_crs(epsg=4326)
tracts["latitude"] = centroids_wgs84.y
tracts["longitude"] = centroids_wgs84.x

In [16]:
# Build a point layer from the tract table: same attributes as `tracts`,
# with the centroid points used as the geometry and the CRS taken from `tracts`
centroids_df = gpd.GeoDataFrame(
    tracts,
    geometry=tracts["centroid"],
    crs=tracts.crs
)

In [17]:
#Computing Nearby Access
def count_entrances_near(lat, lon, entrances, radius_m=500):
    """Summarise subway access within ``radius_m`` metres of a point.

    Great-circle (haversine) distances from ``(lat, lon)`` to every entrance
    are computed in one vectorised numpy pass instead of a Python loop over
    ``iterrows``.

    Parameters
    ----------
    lat, lon : float
        Query point in decimal degrees.
    entrances : pandas.DataFrame
        Must provide the columns "Entrance Latitude", "Entrance Longitude"
        (decimal degrees), "Daytime Routes" (space-separated route names)
        and "Entrance Type".
    radius_m : float, default 500
        Search radius in metres (inclusive).

    Returns
    -------
    tuple[int, int, int]
        ``(entrance_count, unique_line_count, elevator_entrance_count)`` for
        the entrances inside the radius. An entrance counts as accessible
        when its type contains "elevator" (case-insensitive).
    """
    if len(entrances) == 0:
        return 0, 0, 0

    # Mean Earth radius in metres, chosen to match the `haversine` package
    earth_radius_m = 6_371_008.8

    ent_lat = np.radians(entrances["Entrance Latitude"].to_numpy(dtype=float))
    ent_lon = np.radians(entrances["Entrance Longitude"].to_numpy(dtype=float))
    lat_rad = np.radians(lat)
    lon_rad = np.radians(lon)

    # Haversine formula; clip guards arcsin against rounding slightly above 1
    a = (
        np.sin((ent_lat - lat_rad) / 2) ** 2
        + np.cos(lat_rad) * np.cos(ent_lat) * np.sin((ent_lon - lon_rad) / 2) ** 2
    )
    dist_m = 2 * earth_radius_m * np.arcsin(np.sqrt(np.clip(a, 0.0, 1.0)))

    # NaN distances compare False and are therefore excluded
    nearby = entrances[dist_m <= radius_m]

    unique_lines = set()
    # dropna: a missing route list must not be counted as a line called "nan"
    for routes in nearby["Daytime Routes"].dropna():
        unique_lines.update(str(routes).split())

    ada_count = sum(
        "elevator" in str(entrance_type).lower()
        for entrance_type in nearby["Entrance Type"]
    )

    return int(len(nearby)), len(unique_lines), int(ada_count)


In [18]:
# Reproject centroids to EPSG:4326 so the point coordinates are lon/lat degrees,
# which is what the (lat, lon) haversine distance in count_entrances_near is given
centroids_df = centroids_df.to_crs(epsg=4326)

# Now extract lat/lon from reprojected geometry (y is latitude, x is longitude)
centroids_df["latitude"] = centroids_df.geometry.y
centroids_df["longitude"] = centroids_df.geometry.x


In [19]:
# Evaluate every tract centroid against the entrance table `df`; each entry is
# the (entrance_count, line_count, accessible_count) tuple for one centroid
results = [
    count_entrances_near(lat, lon, df)
    for lat, lon in zip(centroids_df["latitude"], centroids_df["longitude"])
]

centroids_df[["entrance_count", "line_count", "accessible_count"]] = pd.DataFrame(results, index=centroids_df.index)


In [20]:
def normalize(series):
    """Min-max scale ``series`` to the [0, 1] range.

    Parameters
    ----------
    series : pandas.Series
        Numeric values to rescale.

    Returns
    -------
    pandas.Series
        ``(series - min) / (max - min)``. When all values are equal the range
        is zero, so a Series of zeros with the same index is returned, which
        keeps the return type consistent instead of a bare scalar ``0``.
    """
    min_val = series.min()
    max_val = series.max()
    if max_val == min_val:
        # Constant input: avoid 0/0 and keep an index-aligned Series
        return pd.Series(0.0, index=series.index)
    return (series - min_val) / (max_val - min_val)

In [21]:
#Get Subway Score
# Min-max normalise each access metric independently
norm_entrances, norm_lines, norm_ada = (
    normalize(centroids_df[metric])
    for metric in ("entrance_count", "line_count", "accessible_count")
)

# Blend the normalised metrics (the weights sum to 1, so the blend lies in [0, 1]) ...
weighted_access = 0.5 * norm_entrances + 0.3 * norm_lines + 0.2 * norm_ada

# ... and stretch it linearly onto the [1, 10] scale
centroids_df["subway_score"] = weighted_access * 9 + 1


In [22]:
# Preview the tract centroids together with the access counts and subway_score
centroids_df.head()

Unnamed: 0_level_0,geometry,Block,Lot,CD,BCTCB2020,CT2010,CB2010,SchoolDist,Council,ZipCode,FireComp,PolicePrct,HealthCent,HealthArea,Address,ZoneDist1,ZoneDist2,ZoneDist3,ZoneDist4,Overlay1,Overlay2,BldgClass,LandUse,Easements,OwnerType,OwnerName,LotArea,BldgArea,ComArea,ResArea,OfficeArea,RetailArea,GarageArea,StrgeArea,FactryArea,OtherArea,AreaSource,NumBldgs,NumFloors,UnitsRes,UnitsTotal,LotFront,LotDepth,BldgFront,BldgDepth,Ext,ProxCode,LotType,BsmtCode,AssessLand,AssessTot,ExemptTot,YearBuilt,YearAlter1,YearAlter2,HistDist,Landmark,BuiltFAR,ResidFAR,CommFAR,FacilFAR,BBL,CondoNo,Tract2010,XCoord,YCoord,ZoneMap,Sanborn,TaxMap,EDesigNum,APPBBL,APPDate,PLUTOMapID,Latitude,Longitude,Shape_Area,centroid,latitude,longitude,entrance_count,line_count,accessible_count,subway_score
BCT2020,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1
1000100,POINT (-74.04224 40.69604),1,101,101,10001001001,1.0,1001,2,1,10004,E007,1,15,8200,1 LIBERTY ISLAND,R3-2,,,,,,P7,8,0,X,U S GOVT LAND & BLDGS,541886,541886,541886,0,0,0,0,0,0,541886,2,10,0.0,0,0,500.0,1046.0,0.0,0.0,N,0,5,0,4225950.0,14355900.0,14355900.0,1900,0,0,Individual Landmark,INDIVIDUAL LANDMARK,1.0,0.5,0.0,1.0,1000010000.0,0,1,971677,190636,16a,199 999,10101,,0.0,,1,40.68992,-74.045337,501897.3,POINT (972538.031 192864.582),40.696038,-74.042236,0,0,0,1.0
1000201,POINT (-73.98589 40.71128),245,1,103,10002012001,2.01,2001,1,1,10002,E015,7,14,8000,286 SOUTH STREET,C6-4,R7-2,,,,,D6,4,0,O,NYC HOUSING AUTHORITY,31654,277260,2000,275260,0,2000,0,0,0,0,2,1,27.0,250,252,162.08,197.17,161.0,160.0,N,1,3,2,1057500.0,26404200.0,26404200.0,1975,1989,0,,,8.76,10.0,10.0,10.0,1002450000.0,0,201,988072,198270,12d,101N057,10106,,0.0,01/04/2022,1,40.710881,-73.986214,33793.1,POINT (988160.732 198415.139),40.71128,-73.985894,4,1,0,1.342105
1000202,POINT (-73.98107 40.71218),260,1,103,10002022001,2.02,3002,1,1,10002,E015,7,14,8000,636 WATER STREET,R7-2,R7-2,,,C1-5,,D9,3,0,O,NYC HOUSING AUTHORITY,261850,513000,0,513000,0,0,0,0,0,0,2,10,6.0,813,813,405.08,634.58,131.0,47.0,N,1,3,2,7294050.0,24886350.0,24886350.0,1940,1986,0,Individual Landmark,INDIVIDUAL LANDMARK,1.96,3.44,0.0,6.5,1002600000.0,0,202,988998,198801,12d,101N059,10106,,0.0,07/25/2023,1,40.712338,-73.982873,279412.5,POINT (989499.251 198744.109),40.712182,-73.981065,0,0,0,1.0
1000500,POINT (-74.01909 40.68877),1,10,101,10005000003,5.0,1000,2,1,10004,E007,1,15,8100,301 COMFORT ROAD,R3-2,C4-1,,,,,Y4,8,0,X,GOVERNORS ISLAND CORPORATION,7600835,2542066,2542066,0,0,0,0,0,0,2542066,2,133,3.5,0,1,0.0,0.0,0.0,0.0,N,0,5,1,102535195.0,160929900.0,160929900.0,1900,0,0,Governors Island Historic District,INDIVIDUAL LANDMARK,0.33,0.5,0.0,1.0,1000010000.0,0,5,979063,190216,16a,199 999,10101,,1000010000.0,09/20/2024,1,40.688774,-74.018704,7414502.0,POINT (978954.87 190215.53),40.688773,-74.019093,0,0,0,1.0
1000600,POINT (-73.98961 40.71217),246,1,103,10006006000,6.0,6000,1,1,10002,E015,7,14,7800,275 SOUTH STREET,C6-4,PARK,,,C1-5,,D6,4,0,X,275 NELSON APARTMENTS LLC,51180,262875,1000,261875,0,1000,0,0,0,0,2,1,19.0,256,258,361.58,152.0,60.0,235.0,N,1,1,0,1516950.0,33809400.0,33809400.0,1978,0,0,Individual Landmark,INDIVIDUAL LANDMARK,5.14,10.0,10.0,10.0,1002460000.0,0,6,987656,198206,12d,101N055,10106,E-489,1002460000.0,03/16/2011,1,40.710706,-73.987714,49958.4,POINT (987129.216 198738.594),40.712168,-73.989614,4,1,0,1.342105


In [23]:
# Store the tract-level subway access scores as CSV.
# The scores live in centroids_df, not in df (the raw entrance table): the
# next stage reads the latitude, longitude and subway_score columns.
# The geometry columns are dropped because the points are rebuilt from
# latitude/longitude on load; reset_index keeps BCT2020 as a regular column.
(
    centroids_df
    .drop(columns=["geometry", "centroid"])
    .reset_index()
    .to_csv('manhatten_subway_access_scores.csv', index=False)
)

# Spatial Joining to make sure it conforms with Census Tracts

In [1]:
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point

# --- Read the scored centroids and turn them into WGS84 points ---
df = pd.read_csv("manhatten_subway_access_scores.csv")
geometry = [Point(lon, lat) for lon, lat in zip(df["longitude"], df["latitude"])]
gdf = gpd.GeoDataFrame(df, geometry=geometry, crs="EPSG:4326")

# --- Read the census tract polygons, keeping only the id and the shape ---
tracts = gpd.read_file("../census tract geofiles/manhattan_census_tracts.geojson")
tracts = tracts.to_crs(epsg=4326)[["GEOID", "geometry"]]

# --- Spatial join: tag each centroid with the GEOID of the tract containing it ---
joined = gpd.sjoin(gdf, tracts, how="inner", predicate="within")

# --- Average the subway score of all centroids that fall in the same tract ---
subway_scores = joined.groupby("GEOID", as_index=False)["subway_score"].mean()

# --- Left-merge onto the full tract list so unmatched tracts are kept, filled with 0 ---
# NOTE(review): the 0.5/0.3/0.2-weighted score is scaled to start at 1, so 0 here means
# "no centroid matched this tract" -- confirm downstream consumers expect that.
all_tracts = tracts[["GEOID"]].drop_duplicates()
final_scores = all_tracts.merge(subway_scores, on="GEOID", how="left")
final_scores = final_scores.fillna({"subway_score": 0})

# --- Write the per-tract scores ---
final_scores.to_csv("subway_score_by_tract.csv", index=False)
print("✅ Saved: subway_score_by_tract.csv with shape:", final_scores.shape)


✅ Saved: subway_score_by_tract.csv with shape: (310, 2)
