In [1]:
import pandas as pd
import requests
import sys

In [2]:
def download_inspections(outfile, start_date='2016-01-01', end_date='2018-12-31', limit=20):
    """Download inspection rows and save them to ``outfile`` as CSV.

    Sends a ``$query`` request to the ``43nn-pn8j`` resource on
    data.cityofnewyork.us, selecting a fixed set of columns for rows whose
    ``inspection_date`` lies between ``start_date`` and ``end_date``.

    Args:
        outfile: Destination passed to ``DataFrame.to_csv``. Must be truthy.
        start_date: Lower bound interpolated into the BETWEEN filter.
        end_date: Upper bound interpolated into the BETWEEN filter.
        limit: Maximum number of rows; coerced with ``int()`` so values such
            as ``3e5`` are accepted.

    Returns:
        pandas.DataFrame built from the JSON rows (the same data written to
        ``outfile``).

    Raises:
        ValueError: If ``outfile`` is empty or ``None``.
        requests.HTTPError: If the API answers with an error status. Nothing
            is written to ``outfile`` in that case.
    """
    if not outfile:
        raise ValueError("No outfile specified.")

    DATA_URL = "https://data.cityofnewyork.us/resource/43nn-pn8j.json"

    query = f"""
        SELECT
            camis,
            dba,
            boro,
            zipcode,
            cuisine_description,
            inspection_date,
            action,
            violation_code,
            violation_description,
            critical_flag,
            score,
            grade,
            inspection_type,
            latitude,
            longitude
        WHERE
            inspection_date BETWEEN "{start_date}" AND "{end_date}"
        LIMIT {int(limit)}
    """

    r = requests.get(DATA_URL, params={"$query": query})
    if not r.ok:
        # Check the status before parsing: an error body may not be JSON, and
        # an error payload must never be turned into a DataFrame and saved
        # over the output file. Show the server's explanation, then fail.
        print(r.text)
        r.raise_for_status()

    rows = r.json()
    data = pd.DataFrame(rows)
    data.to_csv(outfile, index=False)

    return data

In [3]:
def download_311(outfile, start_date='2016-01-01', end_date='2018-12-31', limit=20):
    """Download 311 rows for selected agencies and save them to ``outfile`` as CSV.

    Sends a ``$query`` request to the ``erm2-nwe9`` resource on
    data.cityofnewyork.us, selecting a fixed set of columns for rows whose
    ``agency`` is DEP, DOHMH or DSNY and whose ``created_date`` lies between
    ``start_date`` and ``end_date``.

    Args:
        outfile: Destination passed to ``DataFrame.to_csv``. Must be truthy.
        start_date: Lower bound interpolated into the BETWEEN filter.
        end_date: Upper bound interpolated into the BETWEEN filter.
        limit: Maximum number of rows; coerced with ``int()`` so values such
            as ``3e5`` are accepted.

    Returns:
        pandas.DataFrame built from the JSON rows (the same data written to
        ``outfile``).

    Raises:
        ValueError: If ``outfile`` is empty or ``None``.
        requests.HTTPError: If the API answers with an error status. Nothing
            is written to ``outfile`` in that case.
    """
    if not outfile:
        raise ValueError("No outfile specified.")

    DATA_URL = "https://data.cityofnewyork.us/resource/erm2-nwe9.json"

    query = f"""
        SELECT
            created_date,
            agency,
            complaint_type,
            descriptor,
            location_type,
            incident_zip,
            address_type,
            city,
            facility_type,
            borough,
            latitude,
            longitude
        WHERE
            agency in('DEP', 'DOHMH', 'DSNY')
            AND created_date BETWEEN "{start_date}" AND "{end_date}"
        LIMIT {int(limit)}
    """

    r = requests.get(DATA_URL, params={"$query": query})
    if not r.ok:
        # Check the status before parsing: an error body may not be JSON, and
        # an error payload must never be turned into a DataFrame and saved
        # over the output file. Show the server's explanation, then fail.
        print(r.text)
        r.raise_for_status()

    rows = r.json()
    data = pd.DataFrame(rows)
    data.to_csv(outfile, index=False)

    return data

In [4]:
def download_business(outfile, start_date='2016-01-01', end_date='2018-12-31', limit=20):
    """Download business-license rows and save them to ``outfile`` as CSV.

    Sends a ``$query`` request to the ``w7w3-xahh`` resource on
    data.cityofnewyork.us, selecting a fixed set of columns (several renamed
    with ``as`` aliases). The query has no WHERE clause.

    Args:
        outfile: Destination passed to ``DataFrame.to_csv``. Must be truthy.
        start_date: Accepted but not used by the query; no date filter is
            applied to this dataset.
        end_date: Accepted but not used by the query; see ``start_date``.
        limit: Maximum number of rows; coerced with ``int()`` so values such
            as ``3e5`` are accepted.

    Returns:
        pandas.DataFrame built from the JSON rows (the same data written to
        ``outfile``).

    Raises:
        ValueError: If ``outfile`` is empty or ``None``.
        requests.HTTPError: If the API answers with an error status. Nothing
            is written to ``outfile`` in that case.
    """
    if not outfile:
        raise ValueError("No outfile specified.")

    DATA_URL = "https://data.cityofnewyork.us/resource/w7w3-xahh.json"

    query = f"""
        SELECT
            license_nbr as lic_id,
            license_status as status,
            lic_expir_dd as lic_expire,
            license_creation_date as lic_create,
            industry,
            business_name as name,
            address_zip,
            address_borough,
            latitude,
            longitude

        LIMIT {int(limit)}
    """

    r = requests.get(DATA_URL, params={"$query": query})
    if not r.ok:
        # Check the status before parsing: an error body may not be JSON, and
        # an error payload must never be turned into a DataFrame and saved
        # over the output file. Show the server's explanation, then fail.
        print(r.text)
        r.raise_for_status()

    rows = r.json()
    data = pd.DataFrame(rows)
    data.to_csv(outfile, index=False)

    return data

In [5]:
def download_nypd_historic(outfile, start_date='2016-01-01', end_date='2018-12-31', limit=20):
    """Download historic NYPD complaint rows and save them to ``outfile`` as CSV.

    Sends a ``$query`` request to the ``qgea-i56i`` resource on
    data.cityofnewyork.us, selecting a fixed set of columns (renamed with
    ``as`` aliases) for rows whose ``complaint_date`` lies between
    ``start_date`` and ``end_date``.

    Args:
        outfile: Destination passed to ``DataFrame.to_csv``. Must be truthy.
        start_date: Lower bound interpolated into the BETWEEN filter.
        end_date: Upper bound interpolated into the BETWEEN filter.
        limit: Maximum number of rows; coerced with ``int()`` so values such
            as ``3e5`` are accepted.

    Returns:
        pandas.DataFrame built from the JSON rows (the same data written to
        ``outfile``).

    Raises:
        ValueError: If ``outfile`` is empty or ``None``.
        requests.HTTPError: If the API answers with an error status. Nothing
            is written to ``outfile`` in that case.
    """
    if not outfile:
        raise ValueError("No outfile specified.")

    DATA_URL = "https://data.cityofnewyork.us/resource/qgea-i56i.json"

    # NOTE(review): the WHERE clause filters on the alias `complaint_date`
    # rather than the source column `cmplnt_fr_dt` -- confirm the API resolves
    # aliases in WHERE as intended.
    query = f"""
        SELECT
            cmplnt_fr_dt as complaint_date,
            ky_cd as complaint_code,
            ofns_desc as complaint_description,
            law_cat_cd as offense_lvl,
            boro_nm as boro,
            prem_typ_desc as premisis_type,
            latitude,
            longitude
        WHERE
            complaint_date BETWEEN "{start_date}" AND "{end_date}"
        LIMIT {int(limit)}
    """

    r = requests.get(DATA_URL, params={"$query": query})
    if not r.ok:
        # Check the status before parsing: an error body may not be JSON, and
        # an error payload must never be turned into a DataFrame and saved
        # over the output file. Show the server's explanation, then fail.
        print(r.text)
        r.raise_for_status()

    rows = r.json()
    data = pd.DataFrame(rows)
    data.to_csv(outfile, index=False)

    return data

In [6]:
# Each call below fetches up to 300,000 rows and writes a CSV under ./data.
# Only one downloader is active at a time; uncomment a line to refresh that dataset.
# df = download_inspections(outfile='./data/inspections.csv', limit=300_000)
# df = download_311(outfile='./data/311.csv', limit=300_000)
# df = download_business(outfile='./data/business.csv', limit=300_000)
df = download_nypd_historic(outfile='./data/nypd.csv', limit=300_000)

# Weather

For the historical weather, you need to download the data from NOAA. Use this website.

https://www.ncdc.noaa.gov/cdo-web/datasets/GHCND/stations/GHCND:USW00094728/detail

As far as I can tell, there isn't an automated way to get the weather data from this website, although I didn't look very hard.

Documentation: https://www1.ncdc.noaa.gov/pub/data/cdo/documentation/GHCND_documentation.pdf

Here are the columns if you download all the data.

 Precipitation (PRCP)
 Snow depth (SNWD)
 Snowfall (SNOW)
 Maximum temperature (TMAX)
 Minimum temperature (TMIN)
 Average wind speed (AWND)
 Direction of fastest 2-minute wind (WDF2)
 Direction of fastest 5-second wind (WDF5)
 Fastest 2-minute wind speed (WSF2)
 Fastest 5-second wind speed (WSF5)
 Peak gust time (PGTM)
 Weather types (WT**)

Here are the weather types -- example WT01

01 = Fog, ice fog, or freezing fog (may include heavy fog)
02 = Heavy fog or heavy freezing fog (not always distinguished from fog)
03 = Thunder
04 = Ice pellets, sleet, snow pellets, or small hail
05 = Hail (may include small hail)
06 = Glaze or rime
07 = Dust, volcanic ash, blowing dust, blowing sand, or blowing obstruction
08 = Smoke or haze
09 = Blowing or drifting snow
10 = Tornado, waterspout, or funnel cloud
11 = High or damaging winds
12 = Blowing spray
13 = Mist
14 = Drizzle
15 = Freezing drizzle
16 = Rain (may include freezing rain, drizzle, and freezing drizzle)
17 = Freezing rain
18 = Snow, snow pellets, snow grains, or ice crystals
19 = Unknown source of precipitation
21 = Ground fog
22 = Ice fog or freezing fog

