In [1]:
import json
import os
import shutil
from datetime import date, timedelta

import pandas as pd
import requests
from google.cloud import storage

### API documentation
https://dev.socrata.com/docs/filtering

In [None]:
"""Download the latest additions and updates to the NYC Open Data portal."""
from __future__ import annotations

from datetime import datetime
from pathlib import Path

import click
import pytz
import requests
from retry import retry
from rich import print

from . import utils


@click.command()
@click.option("--verbose", "-v", is_flag=True, default=False, help="Print verbose output.")
def download(verbose: bool) -> None:
    """Download the latest additions and updates to the NYC Open Data portal."""
    # Output goes to <parent of this file's directory>/data/raw.
    raw_dir = Path(__file__).parent.absolute().parent / "data" / "raw"

    # The timestamp is captured before the request, in New York local time.
    started_at = datetime.now(tz=pytz.timezone("America/New_York"))

    # Fetch the catalog payload.
    payload = get_url(verbose=verbose)

    if verbose:
        print(f"Writing to [bold]{raw_dir}[/bold]")

    # The same results are written twice: a timestamped snapshot and a rolling "latest" copy.
    results = payload["results"]
    for file_name in (f"{started_at.isoformat()}.json", "latest.json"):
        utils.write_json(results, raw_dir / file_name)


# Bounded retry with exponential backoff (waits 1s, 2s, 4s, 8s). A bare @retry() retries
# forever with no delay, which hammers the API and never returns on a permanent failure.
@retry(tries=5, delay=1, backoff=2)
def get_url(
    domains: str | list[str] = "data.cityofnewyork.us",
    search_context: str = "data.cityofnewyork.us",
    limit: int = 10000,
    order: str = "updatedAt",
    verbose: bool = True,
    timeout: float = 60,
) -> dict:
    """Query the Socrata catalog API and return the decoded JSON response.

    Args:
        domains: Value sent as the ``domains`` query parameter.
        search_context: Value sent as the ``search_context`` query parameter.
        limit: Value sent as the ``limit`` query parameter.
        order: Value sent as the ``order`` query parameter.
        verbose: When true, print the URL being fetched and the number of results.
        timeout: Seconds to wait for the server before giving up on one attempt.

    Returns:
        The decoded JSON body as a dict. When ``verbose`` is true it must contain
        a ``"results"`` key, because its length is printed.

    Raises:
        requests.HTTPError: If the final attempt returns an error status.
        requests.RequestException: If the final attempt fails or times out.
    """
    url = "http://api.us.socrata.com/api/catalog/v1"
    params = {
        "domains": domains,
        "search_context": search_context,
        "limit": limit,
        "order": order,
    }
    if verbose:
        print(f"Downloading [bold]{url}[/bold]")
    # Without a timeout, requests can block indefinitely on a stalled connection.
    response = requests.get(url, params=params, timeout=timeout)
    response.raise_for_status()
    data = response.json()
    if verbose:
        print(f"Found [bold]{len(data['results'])}[/bold] results.")
    return data


# Invoke the click command only when this code runs as a script, not when it is imported.
if __name__ == "__main__":
    download()

In [2]:
# Person-level crash records (dataset f55k-p6yu, "Motor Vehicle Collisions - Person"); this is the
# dataset the notebook calls the "pedestrian" endpoint:
# https://data.cityofnewyork.us/Public-Safety/Motor-Vehicle-Collisions-Person/f55k-p6yu/about_data
# The query string asks for the 25 rows with the most recent crash_date.
url = "https://data.cityofnewyork.us/resource/f55k-p6yu.json?$order=crash_date DESC&$limit=25"

In [3]:
# Credentials come from the environment (KeyError if either variable is unset) and are sent
# with the request as an (api_key, secret_key) auth tuple.
api_key, secret_key = os.environ['NYCT_API_KEY'], os.environ['NYCT_SECRET_KEY']
r = requests.get(url, auth=(api_key, secret_key))
# Last expression of the cell: display the HTTP status code.
r.status_code

200

In [4]:
# Decode the JSON body, print the number of records, then display the records as cell output.
d = r.json()
print(len(d))
d

25


[{'unique_id': '12979421',
  'collision_id': '4725105',
  'crash_date': '2024-05-14T00:00:00.000',
  'crash_time': '17:10',
  'person_id': 'a85b0abc-51ad-428a-afe8-113598e37c48',
  'person_type': 'Occupant',
  'person_injury': 'Unspecified',
  'vehicle_id': '20651225',
  'ped_role': 'Registrant'},
 {'unique_id': '12976605',
  'collision_id': '4724633',
  'crash_date': '2024-05-14T00:00:00.000',
  'crash_time': '16:57',
  'person_id': '645b9400-9e3c-4182-ab64-513345a2456d',
  'person_type': 'Occupant',
  'person_injury': 'Unspecified',
  'vehicle_id': '20649755',
  'person_age': '20',
  'ejection': 'Not Ejected',
  'emotional_status': 'Does Not Apply',
  'bodily_injury': 'Does Not Apply',
  'position_in_vehicle': 'Driver',
  'safety_equipment': 'Unknown',
  'complaint': 'Does Not Apply',
  'ped_role': 'Driver',
  'person_sex': 'M'},
 {'unique_id': '12978662',
  'collision_id': '4724811',
  'crash_date': '2024-05-14T00:00:00.000',
  'crash_time': '10:16',
  'person_id': 'fc31ecac-61df-45

In [5]:
# Crash-level records (dataset h9gi-nx95, "Motor Vehicle Collisions - Crashes"):
# https://data.cityofnewyork.us/Public-Safety/Motor-Vehicle-Collisions-Crashes/h9gi-nx95/about_data
# The query string asks for the 25 rows with the most recent crash_date.
c_url = "https://data.cityofnewyork.us/resource/h9gi-nx95.json?$order=crash_date DESC&$limit=25"

In [6]:
# Reuses the api_key / secret_key assigned in an earlier cell.
r2 = requests.get(c_url, auth=(api_key, secret_key))
# Last expression of the cell: display the HTTP status code.
r2.status_code

200

In [7]:
# Decode the JSON body, print the number of records, then display the records as cell output.
d2 = r2.json()
print(len(d2))
d2

25


[{'crash_date': '2024-05-14T00:00:00.000',
  'crash_time': '9:50',
  'borough': 'BROOKLYN',
  'zip_code': '11211',
  'latitude': '40.713017',
  'longitude': '-73.93632',
  'location': {'latitude': '40.713017',
   'longitude': '-73.93632',
   'human_address': '{"address": "", "city": "", "state": "", "zip": ""}'},
  'cross_street_name': '951       GRAND STREET',
  'number_of_persons_injured': '2',
  'number_of_persons_killed': '0',
  'number_of_pedestrians_injured': '0',
  'number_of_pedestrians_killed': '0',
  'number_of_cyclist_injured': '0',
  'number_of_cyclist_killed': '0',
  'number_of_motorist_injured': '2',
  'number_of_motorist_killed': '0',
  'contributing_factor_vehicle_1': 'Driver Inattention/Distraction',
  'contributing_factor_vehicle_2': 'Driver Inattention/Distraction',
  'collision_id': '4724746',
  'vehicle_type_code1': 'Box Truck',
  'vehicle_type_code2': 'Sedan'},
 {'crash_date': '2024-05-14T00:00:00.000',
  'crash_time': '22:30',
  'latitude': '40.584557',
  'longit

### TODO

Set up integrations for these datasets. Run them daily and write the results to cloud storage, partitioned by day. The data appears to lag by a few days (when I pulled around midnight on May 17, the most recent data was from May 13).

After the tables are created, join them to create a more exhaustive dataset. The motor vehicle collisions dataset has lat/long data. Questions for EDA:
- what does a map viz look like
- is there a location where accidents happen more often
- people stats? age/bodily injury/death
- what type of vehicle?
- are certain vehicle types more likely to be involved in pedestrian deaths (trucks, obviously)?

Put a streamlit dashboard together that shows the stuff you want to display.

After all that, what is something we can predict, and how can we operationalize it? Apply some of the skills you learned in the MLOps class.

In [17]:
# Fetch crash-level records for a single day using an equality filter on crash_date
# (the value is that day's midnight timestamp), then show status, row count, and rows.
r3 = requests.get('https://data.cityofnewyork.us/resource/h9gi-nx95.json?crash_date=2023-11-22T00:00:00.000', auth=(api_key, secret_key))
print(r3.status_code)
print(len(r3.json()))
# Note: the body is decoded a second time here for display.
r3.json()

200
295


[{'crash_date': '2023-11-22T00:00:00.000',
  'crash_time': '4:52',
  'borough': 'BRONX',
  'zip_code': '10462',
  'cross_street_name': '2149      GLEASON AVENUE',
  'number_of_persons_injured': '0',
  'number_of_persons_killed': '0',
  'number_of_pedestrians_injured': '0',
  'number_of_pedestrians_killed': '0',
  'number_of_cyclist_injured': '0',
  'number_of_cyclist_killed': '0',
  'number_of_motorist_injured': '0',
  'number_of_motorist_killed': '0',
  'contributing_factor_vehicle_1': 'Unspecified',
  'collision_id': '4681370',
  'vehicle_type_code1': 'Sedan'},
 {'crash_date': '2023-11-22T00:00:00.000',
  'crash_time': '13:25',
  'latitude': '40.720005',
  'longitude': '-73.78463',
  'location': {'latitude': '40.720005',
   'longitude': '-73.78463',
   'human_address': '{"address": "", "city": "", "state": "", "zip": ""}'},
  'on_street_name': 'GRAND CENTRAL PKWY',
  'number_of_persons_injured': '0',
  'number_of_persons_killed': '0',
  'number_of_pedestrians_injured': '0',
  'number

In [20]:
# download data. arg inputs?
dt = (date.today() - timedelta(days=5)).strftime('%Y-%m-%d')

endpoints = {
    "pedestrian": "f55k-p6yu.json",
    "collision": "h9gi-nx95.json"
}

'2024-05-14'

In [27]:
def download_data(dt, endpoint, page_size=1000, timeout=60):
    """Download every record for one crash date from an NYC Open Data endpoint.

    The API returns results in pages, so a single un-paged request silently
    truncates days that have more rows than the server's default page size.
    This function pages through the results until a short page is returned.

    Args:
        dt: Crash date as a ``YYYY-MM-DD`` string.
        endpoint: Dataset key, either ``"pedestrian"`` or ``"collision"``.
        page_size: Number of rows requested per call (must be >= 1).
        timeout: Seconds to wait for the server on each request.

    Returns:
        A list of record dicts for the requested date (empty if there are none).

    Raises:
        KeyError: If ``endpoint`` is not a known dataset key.
        ValueError: If ``page_size`` is less than 1.
        requests.HTTPError: If any page request returns an error status.

    Note:
        Reads the notebook-level ``api_key`` and ``secret_key`` variables.
    """
    if page_size < 1:
        raise ValueError(f"page_size must be >= 1, got {page_size}")

    base_url = "https://data.cityofnewyork.us/resource/"
    endpoints = {
        "pedestrian": "f55k-p6yu.json",
        "collision": "h9gi-nx95.json"
    }
    url = base_url + endpoints[endpoint]

    records = []
    offset = 0
    while True:
        # Let requests build and encode the query string instead of concatenating it by hand.
        params = {
            "crash_date": f"{dt}T00:00:00.000",
            # A stable sort order is required for paging to neither skip nor repeat rows.
            "$order": ":id",
            "$limit": page_size,
            "$offset": offset,
        }
        response = requests.get(
            url, params=params, auth=(api_key, secret_key), timeout=timeout
        )
        response.raise_for_status()
        page = response.json()
        records.extend(page)
        # A page shorter than page_size means the result set is exhausted.
        if len(page) < page_size:
            return records
        offset += page_size

# Smoke test: fetch one day of crash-level records, print the count, and show the first record.
c = download_data('2024-01-01', 'collision')
print(len(c))
c[:1]

246


[{'crash_date': '2024-01-01T00:00:00.000',
  'crash_time': '18:34',
  'on_street_name': 'CHURCH AVENUE',
  'off_street_name': 'OCEAN PARKWAY',
  'number_of_persons_injured': '0',
  'number_of_persons_killed': '0',
  'number_of_pedestrians_injured': '0',
  'number_of_pedestrians_killed': '0',
  'number_of_cyclist_injured': '0',
  'number_of_cyclist_killed': '0',
  'number_of_motorist_injured': '0',
  'number_of_motorist_killed': '0',
  'contributing_factor_vehicle_1': 'Unspecified',
  'contributing_factor_vehicle_2': 'Unspecified',
  'collision_id': '4691979',
  'vehicle_type_code1': 'Station Wagon/Sport Utility Vehicle',
  'vehicle_type_code2': 'Station Wagon/Sport Utility Vehicle'}]

In [55]:
def create_tmp_dir(dt, data, base_dir=os.path.join('..', 'data', 'tmp')):
    """Write downloaded records to ``<base_dir>/<dt>/data.json`` as newline-delimited JSON.

    Args:
        dt: Partition name for the sub-directory (the crash date string, e.g. ``2024-01-01``).
        data: Iterable of JSON-serializable records; each one is written on its own line.
        base_dir: Directory under which the per-date directory is created.

    Returns:
        The path of the file that was written, so callers can upload or delete it
        without rebuilding the path themselves.
    """
    data_dir = os.path.join(base_dir, dt)

    # Create the directory (and any missing parents) if it doesn't exist.
    os.makedirs(data_dir, exist_ok=True)

    # One JSON document per line; an empty `data` produces an empty file.
    file_path = os.path.join(data_dir, 'data.json')
    with open(file_path, 'w', encoding='utf-8') as out:
        for record in data:
            json.dump(record, out)
            out.write('\n')
    print(f"data saved to {file_path}")
    return file_path

data saved to ../data/tmp/2024-01-01/data.json


In [67]:
def sync_to_gcs(dt, tmp_dir, dataset="collisions", bucket_name="machine-learning-workspace"):
    """Upload a local data file to Google Cloud Storage under a date-partitioned name.

    Args:
        dt: Partition name used in the object path (the crash date string).
        tmp_dir: Despite the name, the path of the local *file* to upload,
            e.g. ``../data/tmp/2024-01-01/data.json``.
        dataset: Folder name under ``motor-vehicle-crashes/``. Pass a different
            value for other datasets so they do not overwrite the collision object.
        bucket_name: Name of the destination bucket.

    Returns:
        The name of the object that was written.
    """
    source_file = tmp_dir
    destination_blob_name = f"motor-vehicle-crashes/{dataset}/{dt}/data.json"
    storage_client = storage.Client()
    bucket = storage_client.bucket(bucket_name)
    blob = bucket.blob(destination_blob_name)
    blob.upload_from_filename(source_file)
    print(f"File {source_file} uploaded to {destination_blob_name}.")
    return destination_blob_name

File ../data/tmp/2024-01-01/data.json uploaded to motor-vehicle-crashes/collisions/2024-01-01/data.json.


In [73]:
# Delete the local data file if it is present; otherwise just report that it is missing.
# NOTE(review): in the visible cells `file_path` is only assigned inside create_tmp_dir, so it
# presumably has to be defined at notebook level before this cell can run — confirm.
if os.path.isfile(file_path):
    os.remove(file_path)
    print(f"File {file_path} deleted.")
else:
    print(f"File {file_path} not found.")

OSError: [Errno 66] Directory not empty: '../data/tmp/2024-01-01'

In [71]:
def remove_local_dir(tmp_dir):
    """Recursively delete a local directory and its contents, if it exists.

    Args:
        tmp_dir: Path of the directory to remove, e.g. ``../data/tmp/2024-01-01``.

    Does nothing when the path does not exist.
    """
    # Use os.path: the bare name `path` is never imported, so `path.exists` raised NameError.
    if os.path.exists(tmp_dir):
        shutil.rmtree(tmp_dir)

'../data/tmp/2024-01-01'

### TODO 2024-05-20
- Refactor
- Recreate cloud composer environment now that you've updated permissions.
- Cloud Composer discovery.