## Data Acquisition

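The intended approach is to use the `sodapy` package to set up a data stream using the Open Data API. This data is updated monthly, so pulling it through an API means the data will always be up-to-date. Note that the cells below currently load a locally downloaded CSV export (`NYPD_Criminal_Court_Summons__Historic_.csv`) rather than calling the API.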

In [2]:
import pandas as pd
import datetime
import json

In [4]:
# Read the historic criminal court summons export from a local CSV file.
# NOTE(review): the path is anchored to the user's home directory, so this cell
# only runs on a machine with the same folder layout -- confirm before sharing.
summons_raw = pd.read_csv(
    "~/data608_final/docs/scratch/NYPD_Criminal_Court_Summons__Historic_.csv"
)

In [5]:
# Print the raw frame's row count, column names, dtypes, and memory usage.
summons_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5280675 entries, 0 to 5280674
Data columns (total 17 columns):
 #   Column                 Dtype  
---  ------                 -----  
 0   SUMMONS_KEY            int64  
 1   SUMMONS_DATE           object 
 2   OFFENSE_DESCRIPTION    object 
 3   LAW_SECTION_NUMBER     object 
 4   LAW_DESCRIPTION        object 
 5   SUMMONS_CATEGORY_TYPE  object 
 6   AGE_GROUP              object 
 7   SEX                    object 
 8   RACE                   object 
 9   JURISDICTION_CODE      int64  
 10  BORO                   object 
 11  PRECINCT_OF_OCCUR      int64  
 12  X_COORDINATE_CD        float64
 13  Y_COORDINATE_CD        float64
 14  Latitude               float64
 15  Longitude              float64
 16  Lon_Lat                object 
dtypes: float64(4), int64(3), object(10)
memory usage: 684.9+ MB


In [31]:
# Build the working frame: parse SUMMONS_DATE into datetimes, then replace
# every missing value in the frame with an empty string.
# NOTE(review): fillna('') is applied to ALL columns, not only the text ones,
# so any numeric column containing NaN ends up holding '' (object dtype).
# Confirm this is intended for the coordinate columns.
df = (
    summons_raw
    .assign(
        SUMMONS_DATE=lambda raw: pd.to_datetime(
            raw['SUMMONS_DATE'], infer_datetime_format=True
        )
    )
    .fillna('')
)

# Boolean mask selecting offenses whose description mentions marijuana.
cond = df['OFFENSE_DESCRIPTION'].str.contains("MARIJUANA")

In [32]:
# Keep only the marijuana summonses and add the calendar month of each one.
# MONTH is derived from the already-filtered rows (via a callable) instead of
# being computed for every row of `df` and then discarded by index alignment;
# the resulting frame is the same.
mj = df.loc[cond].assign(
    MONTH=lambda rows: rows['SUMMONS_DATE'].dt.month
)
mj.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 180724 entries, 32 to 5280641
Data columns (total 18 columns):
 #   Column                 Non-Null Count   Dtype         
---  ------                 --------------   -----         
 0   SUMMONS_KEY            180724 non-null  int64         
 1   SUMMONS_DATE           180724 non-null  datetime64[ns]
 2   OFFENSE_DESCRIPTION    180724 non-null  object        
 3   LAW_SECTION_NUMBER     180724 non-null  object        
 4   LAW_DESCRIPTION        180724 non-null  object        
 5   SUMMONS_CATEGORY_TYPE  180724 non-null  object        
 6   AGE_GROUP              180724 non-null  object        
 7   SEX                    180724 non-null  object        
 8   RACE                   180724 non-null  object        
 9   JURISDICTION_CODE      180724 non-null  int64         
 10  BORO                   180724 non-null  object        
 11  PRECINCT_OF_OCCUR      180724 non-null  int64         
 12  X_COORDINATE_CD        180724 non-null  ob

In [28]:
# Columns of interest.
# NOTE(review): `cols` is assigned again further down with a different list
# before it is used -- confirm whether this definition is still needed.
cols = [
    'OFFENSE_DESCRIPTION',
    'PRECINCT_OF_OCCUR',
    'Latitude',
    'Longitude',
]

Because race is primarily coded as 'UNKNOWN' prior to 2017, and to make it a little easier to read the map, I will limit my analysis to just the last year, 2019.

In [33]:
# Restrict the marijuana summonses to those dated in calendar year 2019.
mj_summons_2019 = mj.loc[mj['SUMMONS_DATE'].dt.year.eq(2019)]
mj_summons_2019.head(n=3)

Unnamed: 0,SUMMONS_KEY,SUMMONS_DATE,OFFENSE_DESCRIPTION,LAW_SECTION_NUMBER,LAW_DESCRIPTION,SUMMONS_CATEGORY_TYPE,AGE_GROUP,SEX,RACE,JURISDICTION_CODE,BORO,PRECINCT_OF_OCCUR,X_COORDINATE_CD,Y_COORDINATE_CD,Latitude,Longitude,Lon_Lat,MONTH
1687966,194866792,2019-03-18,"MARIJUANA, POSSESSION OF",221.05,Penal Law,MARIJUANA,18-24,M,BLACK,2,BRONX,40,1007170.0,235889,40.8141,-73.9172,POINT (-73.91718536099995 40.81411505800003),3
1687967,197686829,2019-05-27,"MARIJUANA, POSSESSION OF",221.05,Penal Law,MARIJUANA,25-44,M,BLACK,0,BROOKLYN,60,990784.0,149362,40.5766,-73.9765,POINT (-73.97648039599994 40.57664597800005),5
1687977,191935327,2019-01-06,"MARIJUANA, POSSESSION OF",221.05,Penal Law,MARIJUANA,25-44,M,BLACK,0,BRONX,46,1011830.0,252545,40.8598,-73.9003,POINT (-73.90030729499995 40.85981763200005),1


In [53]:
# Count 2019 marijuana summonses per RACE value.
# NOTE(review): despite the variable name, the grouping key is RACE, not
# precinct; PRECINCT_OF_OCCUR and SUMMONS_KEY are only counted within each group.
mj_by_precinct_2019 = (
    mj_summons_2019
    .loc[:, ['PRECINCT_OF_OCCUR', 'RACE', 'SUMMONS_KEY']]
    .groupby(['RACE'])
    .count()
)

In [54]:
# Show the per-race counts, largest first.
mj_by_precinct_2019.sort_values(by='SUMMONS_KEY', ascending=False)

Unnamed: 0_level_0,PRECINCT_OF_OCCUR,SUMMONS_KEY
RACE,Unnamed: 1_level_1,Unnamed: 2_level_1
BLACK,8315,8315
WHITE HISPANIC,3541,3541
BLACK HISPANIC,1559,1559
WHITE,888,888
ASIAN / PACIFIC ISLANDER,382,382
UNKNOWN,183,183
AMERICAN INDIAN/ALASKAN NATIVE,60,60
OTHER,23,23


In [None]:
# Placeholder cell: the run of stray backslashes that was here is not valid
# Python and would stop "Restart & Run All" with a SyntaxError.

In [43]:
# List the distinct month numbers present in the 2019 subset.
mj_summons_2019.loc[:, 'MONTH'].unique()

array([ 3,  5,  1,  2,  8,  7,  4,  6, 10,  9, 11, 12])

In [120]:
#https://gis.stackexchange.com/questions/220997/pandas-to-geojson-multiples-points-features-with-python <-found here
#https://geoffboeing.com/2015/10/exporting-python-data-geojson/ <- adapted from here
def df_to_geojson(df, properties, lat='Latitude', lon='Longitude'):
    """Convert a DataFrame of point records into a GeoJSON FeatureCollection.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per point. Must contain the ``lat`` and ``lon`` columns and
        every column named in ``properties``.
    properties : iterable of str
        Column names copied into each feature's ``properties`` mapping.
    lat, lon : str
        Names of the latitude and longitude columns.

    Returns
    -------
    dict
        ``{'type': 'FeatureCollection', 'features': [...]}`` where each feature
        is a ``Point`` with coordinates ``[lon, lat, 0]``.

    Notes
    -----
    Rows whose latitude or longitude is missing, blank, or otherwise
    non-numeric are skipped: a GeoJSON position must consist of numbers, so
    such rows cannot be expressed as a Point. Values are emitted as plain
    Python scalars so the result can be passed directly to ``json.dump``.
    """
    properties = list(properties)

    # Blank strings and other non-numeric entries become NaN so they can be
    # filtered out together with genuinely missing coordinates.
    lat_values = pd.to_numeric(df[lat], errors='coerce')
    lon_values = pd.to_numeric(df[lon], errors='coerce')
    located = lat_values.notna() & lon_values.notna()

    # Series.tolist() yields native Python scalars for numeric dtypes, which
    # keeps numpy integer/float types out of the JSON payload.
    lats = lat_values[located].tolist()
    lons = lon_values[located].tolist()
    prop_columns = [df.loc[located, prop].tolist() for prop in properties]

    features = []
    for position, (x, y) in enumerate(zip(lons, lats)):
        features.append({
            'type': 'Feature',
            'properties': {
                prop: column[position]
                for prop, column in zip(properties, prop_columns)
            },
            # GeoJSON order is longitude, latitude; the third element is the
            # constant 0 used by the original implementation.
            'geometry': {'type': 'Point', 'coordinates': [x, y, 0]},
        })

    return {'type': 'FeatureCollection', 'features': features}

In [121]:
# Columns exported as feature properties in the GeoJSON file.
cols = [
    'MONTH',
    'RACE',
    'AGE_GROUP',
    'SEX',
]




In [122]:
# Serialize the 2019 marijuana summonses as GeoJSON and write the result to
# the scratch folder (path is relative to the notebook's working directory).
with open('scratch/summons_mj.geojson', mode='w') as outfile:
    outfile.write(json.dumps(df_to_geojson(mj_summons_2019, cols)))
    