In [161]:
import pandas as pd
import geopandas as gpd
import fiona
from shapely.geometry import Point, Polygon
from typing import Union, Tuple
from util_detroit import first_in_range_camera, csv_with_x_y_to_gpd

import re,os
import numpy as np
import matplotlib.pyplot as plt

import kml2geojson
import json

pd.options.display.max_columns = None

### S3 data pull

In [154]:
bucket = "s3://secondmeasure-dev-at-foundational-data/hakso/detroit/"
greenlight_fn = f"Project_Green_Light_Locations.geojson"
fn_911 = f"911_Calls_For_Service.csv"
fullpath_911 = f"{bucket}{fn_911}"
fullpath_greenlight = f"{bucket}{greenlight_fn}"
nrows = 1e5
calls = (
    csv_with_x_y_to_gpd(
        #     fullpath,#from s3
        fn_911,  # local
        read_csv_args={
            "nrows": nrows,
            "usecols": [
                "X",
                "Y",
                "incident_id",
                "calldescription",
                "category",
                "call_timestamp",
            ],
        },
    )
    .astype({"call_timestamp": "datetime64"})
    .assign(call_day=lambda df: df.call_timestamp.round("D"))
)

In [98]:
greenlight_geo_df = (
    gpd.read_file(greenlight_fn)
    .astype({"live_date": "datetime64"})
    .assign(live_day=lambda df: df.live_date.round("D"))
)

#### Distance calculation

In [21]:
print(calls.columns)
greenlight_geo_df.columns

Index(['X', 'Y', 'incident_id', 'calldescription', 'category',
       'call_timestamp', 'geometry'],
      dtype='object')


Index(['address', 'business_name', 'business_type', 'precinct', 'live_date',
       'ObjectId', 'geometry'],
      dtype='object')

In [133]:
from util_detroit import nearest_neighbor

In [None]:
# to do: solve the problem of filtering to only live cameras by selected the top N matches, storing the index and the values in an array in the columns that 
# currently hold the scalars, then write a function to big the shortcut distance for active green lights


# to do: rather than getting closest camera then asking if it's live, instead get all cameras within threshold distance, and ask when the first one went live.

In [None]:
gre

In [None]:
# greenlight_geo_df = nearest_neighbor(greenlight_geo_df, greenlight_geo_df, 2, True)
PROXIMITY_THRESHOLD = 50
combined_df = first_in_range_camera(
    calls.loc[
        :, ["geometry", "calldescription", "category", "call_timestamp", "call_day"]
    ],
    greenlight_geo_df.loc[:, ["live_date", "live_day", "geometry","address"]],
    50,
).dropna(subset=['date_first_live_camera']).assign(
    days_since_live=lambda df: (df.call_day - df.date_first_live_camera).dt.days,
)

In [162]:
date_range = pd.date_range(
    min(combined_df.call_day.min(), combined_df.date_first_live_camera.min()),
    max(combined_df.call_day.max(), combined_df.date_first_live_camera.max()),
)
#     within_proximity_of_greenlight=lambda df: df.meters_to_nearest_match
#     <= PROXIMITY_THRESHOLD,
#     live_at_time_of_call=lambda df: df.live_date_neighbor <= df.call_timestamp,
#     close_to_live_light=lambda df: df.live_at_time_of_call
#     & df.within_proximity_of_greenlight,
# )

In [181]:
background_rate = combined_df.groupby('call_day').category.count().to_frame('total_calls').reindex(index=date_range).fillna(0)

In [191]:
background_rate.head(3)

Unnamed: 0_level_0,total_calls
call_day,Unnamed: 1_level_1
2016-09-20,3
2016-09-21,74
2016-09-22,113


In [202]:
live_cameras = [(greenlight_geo_df.live_day <= x).sum() for x in date_range]
live_cameras = pd.Series(index=date_range,data=live_cameras,name='live_cameras')
live_cameras.index.name = 'call_day'
live_cameras.head(3)

call_day
2016-01-01    8
2016-01-02    8
2016-01-03    8
Freq: D, Name: live_cameras, dtype: int64

to do: assume live date is not correlated with crime in proximity of the camera, calculate average proportion of total crime by project greenlight site.

In [214]:
dff

Unnamed: 0_level_0,days_since_live,ncalls,total_calls,live_cameras,proportion_of_calls_per_camera
call_day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2016-09-20,-617,1,3,36,0.009259
2016-09-20,-147,2,3,36,0.018519
2016-09-21,-1688,1,74,36,0.000375
2016-09-21,-1547,1,74,36,0.000375
2016-09-21,-1525,1,74,36,0.000375
...,...,...,...,...,...
2016-12-11,202,2,69,66,0.000439
2016-12-11,215,1,69,66,0.000220
2016-12-11,230,3,69,66,0.000659
2016-12-11,283,2,69,66,0.000439


In [221]:
dff = combined_df.groupby(["call_day", "days_since_live"]).calldescription.count().to_frame(
    "ncalls"
).join(background_rate).join(live_cameras).assign(
    proportion_of_calls_per_camera=lambda df: df.ncalls
    / df.total_calls
    / df.live_cameras
).reset_index('days_since_live')
idx = dff.days_since_live < 0
dff.loc[idx ,'proportion_of_calls_per_site'] = dff.loc[idx,'ncalls'] / dff.loc[idx,'total_calls']/(dff.live_cameras.max()-dff.loc[idx,'live_cameras'])

In [222]:
dff

Unnamed: 0_level_0,days_since_live,ncalls,total_calls,live_cameras,proportion_of_calls_per_camera,proportion_of_calls_per_site
call_day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2016-09-20,-617,1,3,36,0.009259,0.011111
2016-09-20,-147,2,3,36,0.018519,0.022222
2016-09-21,-1688,1,74,36,0.000375,0.000450
2016-09-21,-1547,1,74,36,0.000375,0.000450
2016-09-21,-1525,1,74,36,0.000375,0.000450
...,...,...,...,...,...,...
2016-12-11,202,2,69,66,0.000439,
2016-12-11,215,1,69,66,0.000220,
2016-12-11,230,3,69,66,0.000659,
2016-12-11,283,2,69,66,0.000439,


In [218]:
combined_df.loc[lambda df: df.within_proximity_of_greenlight, :].groupby(
    ["live_at_time_of_call", "call_day"]
).category.count().to_frame("grouped_count").join(
    background_rate, on="call_day"
).assign(
    proportion_of_calls=lambda df: df.grouped_count / df.total_calls
).join(
    combined_df.loc[lambda df: df.within_proximity_of_greenlight, :]
    .groupby(["address_neighbor", "live_at_time_of_call"])
    .category.count()
    .to_frame("n_sites")
).assign(proportion_of_calls_per_site=lambda df: df.proportion_of_calls/df.n_sites)

AttributeError: 'GeoDataFrame' object has no attribute 'within_proximity_of_greenlight'

In [73]:
background_rate

call_day
2016-09-20      67
2016-09-21    1210
2016-09-22    1185
2016-09-23    1167
2016-09-24    1241
              ... 
2016-12-07    1410
2016-12-08    1611
2016-12-09    1474
2016-12-10    1611
2016-12-11     850
Name: incident_id, Length: 83, dtype: int64

## Approximating the relationship between lat/long distance and meters using google maps in detroit


In [15]:
a = (42.450528328751126, -82.94340486981733)
b = (42.440174033911795, -83.27726268725615)
d = 27500

In [18]:
27500/(((a[1]-b[1])**2)-((a[0]-b[0])**2))**.5

82410.03766311899