In [None]:
import pandas as pd
import requests
import json

In [None]:
# Draw a random sample of at most TARGET_ROWS rows from INPUT_FILE and write
# it to OUTPUT_FILE (without the index column).
INPUT_FILE = '../data/202501-citibike-tripdata_3.csv'
OUTPUT_FILE = '../data/202501-citibike-sample.csv'
TARGET_ROWS = 100000
# Fixed seed so that Restart Kernel -> Run All reproduces the exact same
# sample; without it every run writes a different set of rows.
RANDOM_SEED = 42

print("Reading and sampling...")

df = pd.read_csv(INPUT_FILE)

print(f"Loaded {len(df)} rows.")

# Only sample when the file is larger than the target: sampling without
# replacement cannot return more rows than the frame holds.
if len(df) > TARGET_ROWS:
    df = df.sample(n=TARGET_ROWS, random_state=RANDOM_SEED)

print(f"Saving {len(df)} rows to {OUTPUT_FILE}...")
df.to_csv(OUTPUT_FILE, index=False)
print("Done! You have a perfect random sample.")

Reading and sampling...
Loaded 124475 rows.
Saving 100000 rows to ../data/202501-citibike-sample.csv...
Done! You have a perfect random sample.


In [None]:
# Download tract-level household counts from the Census API (dataset
# acs/acs5, variables B08201_001E and B08201_002E) for the five NYC counties,
# combine them into one frame, derive the share of households without a
# vehicle, and save the result as CSV.
YEAR = "2022"
DATASET = "acs/acs5"
BASE_URL = f"https://api.census.gov/data/{YEAR}/{DATASET}"
VARIABLES = "NAME,B08201_001E,B08201_002E"
# Seconds to wait for the API before giving up; requests has no default
# timeout, so without this a stalled connection would hang the cell forever.
REQUEST_TIMEOUT = 30

# NYC County Codes (FIPS): Bronx(005), Brooklyn(047), Manhattan(061), Queens(081), Staten Island(085)
counties = ["005", "047", "061", "081", "085"]
state_code = "36" # New York

all_data = []
failed_counties = []

print("Fetching data for...")
for county in counties:
    url = f"{BASE_URL}?get={VARIABLES}&for=tract:*&in=state:{state_code}&in=county:{county}"

    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
        # Surface HTTP errors (4xx/5xx) directly instead of letting an error
        # body fail later as a confusing JSON/index error.
        response.raise_for_status()
        data = response.json()
        # The first row of the response holds the column names; the
        # remaining rows are one record per tract.
        headers = data[0]
        rows = data[1:]
        df_county = pd.DataFrame(rows, columns=headers)
        all_data.append(df_county)
        print(f"  - County {county}: Found {len(rows)} tracts")

    except Exception as e:
        # Best-effort per county: keep going, but remember what failed so the
        # gap is reported before anything is saved.
        failed_counties.append(county)
        print(f"  - Error fetching county {county}: {e}")

if not all_data:
    # pd.concat([]) would raise an opaque "No objects to concatenate".
    raise RuntimeError("No census data could be fetched for any county; nothing to save.")

if failed_counties:
    print(f"Warning: no data for counties {failed_counties}; the saved file will be incomplete.")

nyc_df = pd.concat(all_data, ignore_index=True)

nyc_df = nyc_df.rename(columns={
    "B08201_001E": "total_households",
    "B08201_002E": "no_vehicle_households",
    "state": "state_fips",
    "county": "county_fips",
    "tract": "tract_fips"
})

# The API returns every value as a string, so the three FIPS parts are
# concatenated as text (leading zeros preserved) before the counts are
# converted to numbers.
nyc_df["GEOID"] = nyc_df["state_fips"] + nyc_df["county_fips"] + nyc_df["tract_fips"]
nyc_df["total_households"] = pd.to_numeric(nyc_df["total_households"])
nyc_df["no_vehicle_households"] = pd.to_numeric(nyc_df["no_vehicle_households"])

# Despite the "pct" name this is a fraction, not a percentage. Tracts whose
# household count is not positive get NaN rather than a meaningless ratio.
has_households = nyc_df["total_households"] > 0
nyc_df["pct_no_vehicle"] = (
    nyc_df["no_vehicle_households"] / nyc_df["total_households"]
).where(has_households)

nyc_df.to_csv("../data/nyc_transit_equity.csv", index=False)
print("\nSuccess! Saved 'nyc_transit_equity.csv'")

Fetching data for...
  - County 005: Found 361 tracts
  - County 047: Found 805 tracts
  - County 061: Found 310 tracts
  - County 081: Found 725 tracts
  - County 085: Found 126 tracts

Success! Saved 'nyc_transit_equity.csv'


In [None]:
# Download the tract GeoJSON from NYC Open Data and save it next to the other
# data files, but only when the response actually contains features.
TRACT_LIMIT = 5000  # maximum number of features requested via $limit
GEOJSON_TIMEOUT = 60  # seconds; requests would otherwise wait indefinitely
url = f"https://data.cityofnewyork.us/resource/63ge-mke6.geojson?$limit={TRACT_LIMIT}"

print(f"Downloading GeoJSON from {url}...")
response = requests.get(url, timeout=GEOJSON_TIMEOUT)

if response.status_code == 200:
    data = response.json()
    count = len(data.get('features', []))

    if count > 0:
        print(f"Success! Downloaded {count} tracts.")
        if count >= TRACT_LIMIT:
            # Hitting the limit means the server may have cut the result short.
            print(f"Warning: received {count} features, which reaches the $limit of {TRACT_LIMIT}; the data may be truncated.")
        with open('../data/nyc_tracts.geojson', 'w') as f:
            json.dump(data, f)
        # The file is already written into ../data/, so no manual move is needed.
        print("Saved to '../data/nyc_tracts.geojson'.")
    else:
        print("Error: The downloaded file has 0 features.")
else:
    print(f"Error: Failed to download. Status code: {response.status_code}")

Downloading GeoJSON from https://data.cityofnewyork.us/resource/63ge-mke6.geojson?$limit=5000...
Success! Downloaded 2325 tracts.
Saved to 'nyc_tracts.geojson'. Move this file to your data/ folder.
