In [12]:
import pandas as pd
import os 
from datetime import date, datetime, timedelta
import json
import yaml
from arcgis.features import FeatureLayer, GeoAccessor, GeoSeriesAccessor, Table

In [13]:
with open ("./data/sources.yml", "r") as yaml_file:
  data = yaml.safe_load(yaml_file)

crashes_fl = FeatureLayer(data['crashes_in_dc']['mapserver'])
crash_details_fl = FeatureLayer(data['crash_details']['mapserver'])

In [18]:
for f in crashes_fl.properties.fields:
  print(f)

{
  "name": "OBJECTID",
  "type": "esriFieldTypeOID",
  "alias": "OBJECTID",
  "domain": null
}
{
  "name": "CRIMEID",
  "type": "esriFieldTypeString",
  "alias": "CRASHID",
  "length": 25,
  "domain": null
}
{
  "name": "CCN",
  "type": "esriFieldTypeString",
  "alias": "CCN",
  "length": 8,
  "domain": null
}
{
  "name": "REPORTDATE",
  "type": "esriFieldTypeDate",
  "alias": "REPORTDATE",
  "length": 8,
  "domain": null
}
{
  "name": "ROUTEID",
  "type": "esriFieldTypeString",
  "alias": "RouteID",
  "length": 255,
  "domain": null
}
{
  "name": "MEASURE",
  "type": "esriFieldTypeDouble",
  "alias": "Measure",
  "domain": null
}
{
  "name": "OFFSET",
  "type": "esriFieldTypeDouble",
  "alias": "OFFSET",
  "domain": null
}
{
  "name": "STREETSEGID",
  "type": "esriFieldTypeDouble",
  "alias": "STREETSEGID",
  "domain": null
}
{
  "name": "ROADWAYSEGID",
  "type": "esriFieldTypeDouble",
  "alias": "ROADWAYSEGID",
  "domain": null
}
{
  "name": "FROMDATE",
  "type": "esriFieldTypeDate"

The most frequent update interval for our datasets is daily. Most are updated weekly or irregularly, but we will schedule our DAG to check for daily updates. 

In [39]:
# The query syntax in the REST API for ARCGIS is inflexible -- we will work around the function we know works. 
# Simply subtracting a day misses entries which occurred yesterday but earlier in the day than the current time.
# Subtracting 2 days includes these but also some dates from the day before.
# We start with 2 and then pare down to just yesterday.
result = crashes_fl.query(where="REPORTDATE >= CURRENT_TIMESTAMP - 8") 

In [40]:
from time_utils import ms, midnight

# Get midnight today and midnight yesterday in miliseconds from 1970-01-01
t1 = ms(midnight(date.today()-timedelta(days=7)))
t2 = ms(midnight(date.today()))

# Check against REPORTDATE in x.features
incidents = [{'geometry':f.geometry, 'attributes':f.attributes} 
      for f in result.features if t1 < f.attributes['REPORTDATE'] < t2] 

# Push to XCOM
# ----------------------------------------
# Read array from XCOM 

# Convert to Dataframe
df = pd.json_normalize(incidents)
df.columns = [col.split('.')[1] for col in df] # remove column prefixes 
df['REPORTDATE'] = df['REPORTDATE'].map(lambda v: datetime(1970,1,1) + timedelta(seconds=v/1000))
# Push to BIGQUERY
df.sort_values('REPORTDATE')

Unnamed: 0,x,y,spatialReference,spatialReference.1,OBJECTID,CRIMEID,CCN,REPORTDATE,ROUTEID,MEASURE,...,LASTUPDATEDATE,MPDLATITUDE,MPDLONGITUDE,MPDGEOX,MPDGEOY,FATALPASSENGER,MAJORINJURIESPASSENGER,MINORINJURIESPASSENGER,UNKNOWNINJURIESPASSENGER,MAR_ID
16,-8.580656e+06,4.708388e+06,102100,3857,161524931,41154106591,23012888,2023-01-25 00:46:00,11037892,255.94,...,1.674834e+12,38.907106,-77.081539,,,0.0,0.0,0.0,0.0,271384.0
17,-8.570687e+06,4.703996e+06,102100,3857,161524932,41154151483,23012839,2023-01-25 00:55:00,13057852,1502.29,...,,38.876326,-76.991784,,,0.0,0.0,0.0,0.0,311850.0
18,-8.569333e+06,4.712118e+06,102100,3857,161524933,41154209309,23012884,2023-01-25 01:07:00,12001802,2640.44,...,1.674920e+12,38.933250,-76.979874,,,0.0,0.0,1.0,0.0,150714.0
19,-8.574494e+06,4.708531e+06,102100,3857,161524934,41154384585,23012937,2023-01-25 01:46:00,11001002,1789.24,...,1.674920e+12,38.908197,-77.026192,,,0.0,0.0,0.0,0.0,239325.0
20,-8.568726e+06,4.713110e+06,102100,3857,161524935,41154390942,23012271,2023-01-25 01:48:00,12002202,2263.72,...,1.674920e+12,38.940195,-76.974391,,,0.0,0.0,0.0,0.0,26904.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
182,-8.579930e+06,4.716461e+06,102100,3857,161525097,41264853605,23014998,2023-01-28 18:36:00,11025152,7620.69,...,,38.963494,-77.075125,,,0.0,0.0,0.0,0.0,299971.0
183,-8.568453e+06,4.703635e+06,102100,3857,161525098,41265177803,23015063,2023-01-28 20:00:00,13069812,3150.92,...,,38.874249,-76.971515,,,0.0,0.0,0.0,0.0,295223.0
184,-8.565659e+06,4.707112e+06,102100,3857,161525099,41265308370,23015078,2023-01-28 20:31:00,12061162,1103.52,...,,38.898199,-76.946300,,,0.0,0.0,0.0,0.0,26428.0
185,-8.567197e+06,4.703030e+06,102100,3857,161525100,41265855195,23007991,2023-01-28 22:38:00,13017352,875.12,...,,38.869768,-76.959913,,,0.0,0.0,0.0,0.0,302271.0


In [15]:
# Simply subtracting 7 days misses entries which occurred a week ago but earlier in the day from the current time.
# Subtracting 8 days includes these but also some dates from the day before.
# We start with 8 and then pare down to just the past 7 days.
# x = crashes_fl.query(where="REPORTDATE >= CURRENT_TIMESTAMP - 8") 

In [None]:
# # Get miliseconds from 1970-01-01 to 12AM today
# midnight = datetime.combine(date.today(), datetime.min.time())
# t1 = (midnight - datetime(1970,1,1)).total_seconds() * 1000

# # Subtract a week in miliseconds from this figure 
# week = timedelta(days=7).total_seconds() * 1000 
# t2 = t1-week

# # Check against REPORTDATE in x.features
# incidents = [{'geometry':f.geometry, 'attributes':f.attributes} 
#       for f in x.features if f.attributes['REPORTDATE'] >= t2] 

# # Push via XCOM: (sys.getsizeof(incidents) = 1.68 KB, Feb 1 2023)
# #--------------------------------
# # Read array from XCOM 
# # Convert to Dataframe
# df = pd.json_normalize(incidents)
# df.columns = [col.split('.')[1] for col in df] # remove column prefixes 
# df['REPORTDATE'] = df['REPORTDATE'].map(lambda v: datetime(1970,1,1) + timedelta(seconds=v/1000))
# # Push to BIGQUERY

# len(incidents)


188

In [None]:
# def queryLastDays(days:int):
#   query = crashes_fl.query(where=f"REPORTDATE >= CURRENT_TIMESTAMP - {days}", out_fields="CCN")
#   ccn = [f.attributes['CCN'] for f in query.features]

#   records = dc_df2[dc_df2['CCN'].isin(ccn)]
#   return records[['REPORTDATE','CCN']].sort_values('REPORTDATE',ascending=True)

# last7 = queryLastDays(7)
# last8 = queryLastDays(8)
# last9 = queryLastDays(9)
# last11 = queryLastDays(11)
# x = crashes_fl.query(where="REPORTDATE >= CURRENT_TIMESTAMP - 8", out_fields="CCN")
# ccn = [f.attributes['CCN'] for f in x.features]
# print(len(x.features))

In [None]:
import sys 

sys.getsizeof(incidents)


1680

In [None]:
datetime(1970,1,1) + timedelta(seconds=1748062800000/1000)

datetime.datetime(2025, 5, 24, 5, 0)

In [None]:
import airflow