In [101]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split


In [102]:
# Note: the data is updated quarterly. Last update was 10-23-2024
df = pd.read_csv('./NYPD_Arrest_Data__Year_to_Date__20241023.csv')
df.head(10)

Unnamed: 0,ARREST_KEY,ARREST_DATE,PD_CD,PD_DESC,KY_CD,OFNS_DESC,LAW_CODE,LAW_CAT_CD,ARREST_BORO,ARREST_PRECINCT,JURISDICTION_CODE,AGE_GROUP,PERP_SEX,PERP_RACE,X_COORD_CD,Y_COORD_CD,Latitude,Longitude,New Georeferenced Column
0,281240883,01/28/2024,105.0,STRANGULATION 1ST,106.0,FELONY ASSAULT,PL 1211200,F,Q,105,0,25-44,M,WHITE,1057545,207911,40.737043,-73.735514,POINT (-73.735514 40.737043)
1,282884120,02/27/2024,263.0,"ARSON 2,3,4",114.0,ARSON,PL 1501001,F,Q,107,71,45-64,M,WHITE,1037489,206343,40.732881,-73.807899,POINT (-73.807899 40.732881)
2,283137868,03/03/2024,109.0,"ASSAULT 2,1,UNCLASSIFIED",106.0,FELONY ASSAULT,PL 1200502,F,B,48,0,25-44,M,BLACK,1013900,250835,40.855109,-73.892818,POINT (-73.892818 40.855109)
3,287001362,05/16/2024,109.0,"ASSAULT 2,1,UNCLASSIFIED",106.0,FELONY ASSAULT,PL 1200512,F,S,121,0,25-44,M,WHITE,938928,168468,40.628967,-74.163275,POINT (-74.163275 40.628967)
4,287829614,06/02/2024,105.0,STRANGULATION 1ST,106.0,FELONY ASSAULT,PL 1211200,F,Q,100,0,25-44,M,BLACK,1039777,155013,40.59198,-73.800066,POINT (-73.800066 40.59198)
5,280513565,01/14/2024,153.0,RAPE 3,104.0,RAPE,PL 1302503,F,M,14,0,18-24,M,BLACK,985764,213806,40.753533,-73.994537,POINT (-73.9945368920152 40.7535327012632)
6,291269261,08/07/2024,157.0,RAPE 1,104.0,RAPE,PL 1303504,F,K,84,0,25-44,M,WHITE HISPANIC,988902,192641,40.695439,-73.983225,POINT (-73.9832253756043 40.6954388081238)
7,280286274,01/10/2024,105.0,STRANGULATION 1ST,106.0,FELONY ASSAULT,PL 1211200,F,K,70,0,25-44,M,BLACK,993690,172242,40.639436,-73.965983,POINT (-73.965983 40.639436)
8,281035905,01/24/2024,777.0,(null),,(null),PL 1950200,F,K,67,0,45-64,F,WHITE,997897,175676,40.648859,-73.95082,POINT (-73.95082 40.648859)
9,279805425,01/02/2024,109.0,"ASSAULT 2,1,UNCLASSIFIED",106.0,FELONY ASSAULT,PL 1200502,F,Q,100,0,25-44,M,BLACK,1035353,152906,40.586222,-73.816011,POINT (-73.816011 40.586222)


In [103]:
# Predict Level of Offense based on the Borough and Jurisdiction
# Could also consider using Precinct. Precinct roughly corresponds to neighborhood
_ARREST_BORO = "ARREST_BORO"
_JURISDICTION_CODE = "JURISDICTION_CODE"
_OFFENSE_LEVEL = "LAW_CAT_CD"

# Ensure that offense level is either misdemeanor, felony, or violation
df = df.loc[(df[_OFFENSE_LEVEL] == 'F') | (df[_OFFENSE_LEVEL] == 'M') | (df[_OFFENSE_LEVEL] == 'V')]
print((df[_OFFENSE_LEVEL] == 'F') .any())

# Limit jurisdictions to patrol, transit, and housing (others are non NYPD)
df = df.loc[(df[_JURISDICTION_CODE] ==0) | (df[_JURISDICTION_CODE] == 1) | (df[_JURISDICTION_CODE] == 2)]

# Ensure all borough is either Queens, Brooklyn, Manhattan, Bronx, or Staten Island
df = df.loc[(df[_ARREST_BORO] == 'Q') | (df[_ARREST_BORO] == 'K') | (df[_ARREST_BORO] == 'M')| (df[_ARREST_BORO] == 'B') | (df[_ARREST_BORO] == 'S')]

# Remove the first column from the data frame (it's just an identifier)
attr = df.drop("ARREST_KEY", axis=1)

# Drop the target column from the data frame
attr = df.drop(_OFFENSE_LEVEL, axis=1)
target = df[_OFFENSE_LEVEL]


attr.head()

True


Unnamed: 0,ARREST_KEY,ARREST_DATE,PD_CD,PD_DESC,KY_CD,OFNS_DESC,LAW_CODE,ARREST_BORO,ARREST_PRECINCT,JURISDICTION_CODE,AGE_GROUP,PERP_SEX,PERP_RACE,X_COORD_CD,Y_COORD_CD,Latitude,Longitude,New Georeferenced Column
0,281240883,01/28/2024,105.0,STRANGULATION 1ST,106.0,FELONY ASSAULT,PL 1211200,Q,105,0,25-44,M,WHITE,1057545,207911,40.737043,-73.735514,POINT (-73.735514 40.737043)
2,283137868,03/03/2024,109.0,"ASSAULT 2,1,UNCLASSIFIED",106.0,FELONY ASSAULT,PL 1200502,B,48,0,25-44,M,BLACK,1013900,250835,40.855109,-73.892818,POINT (-73.892818 40.855109)
3,287001362,05/16/2024,109.0,"ASSAULT 2,1,UNCLASSIFIED",106.0,FELONY ASSAULT,PL 1200512,S,121,0,25-44,M,WHITE,938928,168468,40.628967,-74.163275,POINT (-74.163275 40.628967)
4,287829614,06/02/2024,105.0,STRANGULATION 1ST,106.0,FELONY ASSAULT,PL 1211200,Q,100,0,25-44,M,BLACK,1039777,155013,40.59198,-73.800066,POINT (-73.800066 40.59198)
5,280513565,01/14/2024,153.0,RAPE 3,104.0,RAPE,PL 1302503,M,14,0,18-24,M,BLACK,985764,213806,40.753533,-73.994537,POINT (-73.9945368920152 40.7535327012632)


In [104]:
# Split the data into training and test
attr_train, attr_test, target_train, target_test = train_test_split(attr, target, test_size=0.2, random_state=5)

In [None]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
# Identify the numerical columns
print(df.select_dtypes(include=int))

# Be careful - ARREST_PRECINCT and JURISDICTION code are numerical, but are categorical (not continuous)
categorical_columns = []
categorical_columns.append(['ARREST_PRECINCT', _JURISDICTION_CODE])

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(), categorical_columns)
    ]
)


        ARREST_KEY  ARREST_PRECINCT  JURISDICTION_CODE  X_COORD_CD  Y_COORD_CD
0        281240883              105                  0     1057545      207911
2        283137868               48                  0     1013900      250835
3        287001362              121                  0      938928      168468
4        287829614              100                  0     1039777      155013
5        280513565               14                  0      985764      213806
...            ...              ...                ...         ...         ...
195442   293968561              109                  0     1030206      215367
195443   292591949               34                  0     1004094      251444
195444   294028627               23                  0     1000141      225685
195445   292933189               44                  1     1004749      240880
195446   293706408               46                  0     1011399      250642

[188701 rows x 5 columns]
