In [4]:
import pandas as pd
import os 
from datetime import date, datetime, timedelta
import json
import yaml
from arcgis.features import FeatureLayer, GeoAccessor, GeoSeriesAccessor, Table

In [5]:
with open ("./data/sources.yml", "r") as yaml_file:
  data = yaml.safe_load(yaml_file)

crashes_fl = FeatureLayer(data['crashes']['mapserver'])
crash_details_fl = FeatureLayer(data['crash_details']['mapserver'])

In [6]:
for f in crashes_fl.properties.fields:
  print(f)

{
  "name": "OBJECTID",
  "type": "esriFieldTypeOID",
  "alias": "OBJECTID",
  "domain": null
}
{
  "name": "CRIMEID",
  "type": "esriFieldTypeString",
  "alias": "CRASHID",
  "length": 25,
  "domain": null
}
{
  "name": "CCN",
  "type": "esriFieldTypeString",
  "alias": "CCN",
  "length": 8,
  "domain": null
}
{
  "name": "REPORTDATE",
  "type": "esriFieldTypeDate",
  "alias": "REPORTDATE",
  "length": 8,
  "domain": null
}
{
  "name": "ROUTEID",
  "type": "esriFieldTypeString",
  "alias": "RouteID",
  "length": 255,
  "domain": null
}
{
  "name": "MEASURE",
  "type": "esriFieldTypeDouble",
  "alias": "Measure",
  "domain": null
}
{
  "name": "OFFSET",
  "type": "esriFieldTypeDouble",
  "alias": "OFFSET",
  "domain": null
}
{
  "name": "STREETSEGID",
  "type": "esriFieldTypeDouble",
  "alias": "STREETSEGID",
  "domain": null
}
{
  "name": "ROADWAYSEGID",
  "type": "esriFieldTypeDouble",
  "alias": "ROADWAYSEGID",
  "domain": null
}
{
  "name": "FROMDATE",
  "type": "esriFieldTypeDate"

The most frequent update interval for our datasets is daily. Most are updated weekly or irregularly, but we will schedule our DAG to check for daily updates. 

In [7]:
# The query syntax in the REST API for ARCGIS is inflexible -- we will work around the function we know works. 
# Simply subtracting a day misses entries which occurred yesterday but earlier in the day than the current time.
# Subtracting 2 days includes these but also some dates from the day before.
# We start with 2 and then pare down to just yesterday.
result = crashes_fl.query(where="REPORTDATE >= CURRENT_TIMESTAMP - 8") 

In [8]:
from time_utils import ms, midnight

# Get midnight today and midnight yesterday in miliseconds from 1970-01-01
t1 = ms(midnight(date.today()-timedelta(days=7)))
t2 = ms(midnight(date.today()))

# Check against REPORTDATE in x.features
incidents = [{'geometry':f.geometry, 'attributes':f.attributes} 
      for f in result.features if t1 < f.attributes['REPORTDATE'] < t2] 

# Push to XCOM
# ----------------------------------------
# Read array from XCOM 

# Convert to Dataframe
df = pd.json_normalize(incidents)
df.columns = [col.split('.')[1] for col in df] # remove column prefixes 
df['REPORTDATE'] = df['REPORTDATE'].map(lambda v: datetime(1970,1,1) + timedelta(seconds=v/1000))
# Push to BIGQUERY
df.sort_values('REPORTDATE')

Unnamed: 0,x,y,spatialReference,spatialReference.1,OBJECTID,CRIMEID,CCN,REPORTDATE,ROUTEID,MEASURE,...,LASTUPDATEDATE,MPDLATITUDE,MPDLONGITUDE,MPDGEOX,MPDGEOY,FATALPASSENGER,MAJORINJURIESPASSENGER,MINORINJURIESPASSENGER,UNKNOWNINJURIESPASSENGER,MAR_ID
64,-8.581171e+06,4.712834e+06,102100,3857,161806928,41181819497,23013418,2023-01-26 00:07:00,11090702,158.38,...,1.674920e+12,38.937939,-77.085888,,,0.0,0.0,0.0,0.0,310387.0
65,-8.568359e+06,4.703688e+06,102100,3857,161806929,41181924420,23013412,2023-01-26 00:25:00,13054992,178.63,...,,38.874473,-76.970767,,,0.0,0.0,0.0,0.0,150398.0
66,-8.575450e+06,4.707871e+06,102100,3857,161806930,41181955855,23013414,2023-01-26 00:30:00,11001502,1710.00,...,,38.903578,-77.034129,,,0.0,0.0,0.0,0.0,218272.0
67,-8.576362e+06,4.707720e+06,102100,3857,161806931,41181962150,23013434,2023-01-26 00:31:00,11050892,2921.29,...,1.674920e+12,38.902244,-77.042765,,,0.0,0.0,0.0,0.0,302221.0
68,-8.575733e+06,4.710128e+06,102100,3857,161806932,41182022587,01003447,2023-01-26 00:42:00,11014452A,36.43,...,,38.919205,-77.036913,,,0.0,0.0,0.0,0.0,226216.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
121,-8.579930e+06,4.716461e+06,102100,3857,161807016,41264853605,23014998,2023-01-28 18:36:00,11025152,7620.69,...,,38.963494,-77.075125,,,0.0,0.0,0.0,0.0,299971.0
122,-8.568453e+06,4.703635e+06,102100,3857,161807017,41265177803,23015063,2023-01-28 20:00:00,13069812,3150.92,...,,38.874249,-76.971515,,,0.0,0.0,0.0,0.0,295223.0
123,-8.565659e+06,4.707112e+06,102100,3857,161807018,41265308370,23015078,2023-01-28 20:31:00,12061162,1103.52,...,,38.898199,-76.946300,,,0.0,0.0,0.0,0.0,26428.0
124,-8.567197e+06,4.703030e+06,102100,3857,161807019,41265855195,23007991,2023-01-28 22:38:00,13017352,875.12,...,,38.869768,-76.959913,,,0.0,0.0,0.0,0.0,302271.0


In [9]:
# Simply subtracting 7 days misses entries which occurred a week ago but earlier in the day from the current time.
# Subtracting 8 days includes these but also some dates from the day before.
# We start with 8 and then pare down to just the past 7 days.
# x = crashes_fl.query(where="REPORTDATE >= CURRENT_TIMESTAMP - 8") 

In [10]:
# # Get miliseconds from 1970-01-01 to 12AM today
# midnight = datetime.combine(date.today(), datetime.min.time())
# t1 = (midnight - datetime(1970,1,1)).total_seconds() * 1000

# # Subtract a week in miliseconds from this figure 
# week = timedelta(days=7).total_seconds() * 1000 
# t2 = t1-week

# # Check against REPORTDATE in x.features
# incidents = [{'geometry':f.geometry, 'attributes':f.attributes} 
#       for f in x.features if f.attributes['REPORTDATE'] >= t2] 

# # Push via XCOM: (sys.getsizeof(incidents) = 1.68 KB, Feb 1 2023)
# #--------------------------------
# # Read array from XCOM 
# # Convert to Dataframe
# df = pd.json_normalize(incidents)
# df.columns = [col.split('.')[1] for col in df] # remove column prefixes 
# df['REPORTDATE'] = df['REPORTDATE'].map(lambda v: datetime(1970,1,1) + timedelta(seconds=v/1000))
# # Push to BIGQUERY

# len(incidents)


In [11]:
# def queryLastDays(days:int):
#   query = crashes_fl.query(where=f"REPORTDATE >= CURRENT_TIMESTAMP - {days}", out_fields="CCN")
#   ccn = [f.attributes['CCN'] for f in query.features]

#   records = dc_df2[dc_df2['CCN'].isin(ccn)]
#   return records[['REPORTDATE','CCN']].sort_values('REPORTDATE',ascending=True)

# last7 = queryLastDays(7)
# last8 = queryLastDays(8)
# last9 = queryLastDays(9)
# last11 = queryLastDays(11)
# x = crashes_fl.query(where="REPORTDATE >= CURRENT_TIMESTAMP - 8", out_fields="CCN")
# ccn = [f.attributes['CCN'] for f in x.features]
# print(len(x.features))

In [12]:
import sys 

sys.getsizeof(incidents)


1080

In [13]:
datetime(1970,1,1) + timedelta(seconds=1748062800000/1000)

datetime.datetime(2025, 5, 24, 5, 0)

In [14]:
import airflow.providers

ModuleNotFoundError: No module named 'airflow'