In [41]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split


In [42]:
# The source data set is refreshed quarterly; this snapshot comes from the
# 10-23-2024 update (see the date suffix in the file name).
df = pd.read_csv(
    filepath_or_buffer='./NYPD_Arrest_Data__Year_to_Date__20241023.csv',
)

# Preview the first ten rows of the raw frame.
df.head(n=10)

Unnamed: 0,ARREST_KEY,ARREST_DATE,PD_CD,PD_DESC,KY_CD,OFNS_DESC,LAW_CODE,LAW_CAT_CD,ARREST_BORO,ARREST_PRECINCT,JURISDICTION_CODE,AGE_GROUP,PERP_SEX,PERP_RACE,X_COORD_CD,Y_COORD_CD,Latitude,Longitude,New Georeferenced Column
0,281240883,01/28/2024,105.0,STRANGULATION 1ST,106.0,FELONY ASSAULT,PL 1211200,F,Q,105,0,25-44,M,WHITE,1057545,207911,40.737043,-73.735514,POINT (-73.735514 40.737043)
1,282884120,02/27/2024,263.0,"ARSON 2,3,4",114.0,ARSON,PL 1501001,F,Q,107,71,45-64,M,WHITE,1037489,206343,40.732881,-73.807899,POINT (-73.807899 40.732881)
2,283137868,03/03/2024,109.0,"ASSAULT 2,1,UNCLASSIFIED",106.0,FELONY ASSAULT,PL 1200502,F,B,48,0,25-44,M,BLACK,1013900,250835,40.855109,-73.892818,POINT (-73.892818 40.855109)
3,287001362,05/16/2024,109.0,"ASSAULT 2,1,UNCLASSIFIED",106.0,FELONY ASSAULT,PL 1200512,F,S,121,0,25-44,M,WHITE,938928,168468,40.628967,-74.163275,POINT (-74.163275 40.628967)
4,287829614,06/02/2024,105.0,STRANGULATION 1ST,106.0,FELONY ASSAULT,PL 1211200,F,Q,100,0,25-44,M,BLACK,1039777,155013,40.59198,-73.800066,POINT (-73.800066 40.59198)
5,280513565,01/14/2024,153.0,RAPE 3,104.0,RAPE,PL 1302503,F,M,14,0,18-24,M,BLACK,985764,213806,40.753533,-73.994537,POINT (-73.9945368920152 40.7535327012632)
6,291269261,08/07/2024,157.0,RAPE 1,104.0,RAPE,PL 1303504,F,K,84,0,25-44,M,WHITE HISPANIC,988902,192641,40.695439,-73.983225,POINT (-73.9832253756043 40.6954388081238)
7,280286274,01/10/2024,105.0,STRANGULATION 1ST,106.0,FELONY ASSAULT,PL 1211200,F,K,70,0,25-44,M,BLACK,993690,172242,40.639436,-73.965983,POINT (-73.965983 40.639436)
8,281035905,01/24/2024,777.0,(null),,(null),PL 1950200,F,K,67,0,45-64,F,WHITE,997897,175676,40.648859,-73.95082,POINT (-73.95082 40.648859)
9,279805425,01/02/2024,109.0,"ASSAULT 2,1,UNCLASSIFIED",106.0,FELONY ASSAULT,PL 1200502,F,Q,100,0,25-44,M,BLACK,1035353,152906,40.586222,-73.816011,POINT (-73.816011 40.586222)


In [43]:
_ARREST_KEY = "ARREST_KEY"

_ARREST_BORO = "ARREST_BORO"
_JURISDICTION_CODE = "JURISDICTION_CODE"
_OFFENSE_LEVEL = "LAW_CAT_CD"

_ARREST_DATE = "ARREST_DATE"
_ARREST_MONTH = "ARREST_MONTH" 

_ARREST_PRECINCT = "ARREST_PRECINCT"

_PERPETRATOR_RACE = "PERP_RACE"
_PERPETRATOR_SEX = "PERP_SEX"
_PERPETRATOR_AGE_GROUP="AGE_GROUP"

_INTERNAL_CLASSIFICATION="PD_CD"
_INTERNAL_CLASSIFICATION_DESCRIPTION="PD_DESC"
_THREE_DIGIT_INTERNAL_CLASSIFICATION="KY_CD"

_GEOGRAPHICAL_POSITION = "New Georeferenced Column"

# Number of columns in the loaded frame
len(df.columns)

19

In [44]:
# Goal: predict the level of offense.
# Precinct is another candidate worth considering; it roughly corresponds to neighborhood.


# Keep only rows whose offense level is felony, misdemeanor, or violation
df = df.loc[df[_OFFENSE_LEVEL].isin(['F', 'M', 'V'])]
# Sanity check: at least one felony row is still present after the filter
print(df[_OFFENSE_LEVEL].eq('F').any())

# Keep only the patrol, transit, and housing jurisdictions (others are non NYPD)
df = df.loc[df[_JURISDICTION_CODE].isin([0, 1, 2])]

# Keep only arrests whose borough is Queens, Brooklyn, Manhattan, Bronx, or Staten Island
df = df.loc[df[_ARREST_BORO].isin(['Q', 'K', 'M', 'B', 'S'])]


True


In [45]:

# Columns that are removed from the frame, grouped by the reason for removal.
_columns_to_drop = [
    # Perpetrator sex / race / age group
    _PERPETRATOR_SEX,
    _PERPETRATOR_RACE,
    _PERPETRATOR_AGE_GROUP,
    # Internal classification codes and their description
    _INTERNAL_CLASSIFICATION,
    _THREE_DIGIT_INTERNAL_CLASSIFICATION,
    _INTERNAL_CLASSIFICATION_DESCRIPTION,
    # "New Georeferenced Column": latitude/longitude already exist as separate columns
    _GEOGRAPHICAL_POSITION,
    # Arrest key: it's just an identifier
    _ARREST_KEY,
]

df = df.drop(columns=_columns_to_drop)



In [46]:
# Reduce ARREST_DATE to just the month number.
# The dates are stored as MM/DD/YYYY strings (e.g. "01/28/2024"); the format is
# given explicitly so the parse does not depend on pandas' format inference
# and a value in any other layout raises instead of being misread.
df[_ARREST_DATE] = pd.to_datetime(df[_ARREST_DATE], format="%m/%d/%Y")

df[_ARREST_MONTH] = df[_ARREST_DATE].dt.month

# The full date is not kept once the month has been extracted
df = df.drop(columns=[_ARREST_DATE])

# Preview the new month column
df[_ARREST_MONTH].head()

0    1
2    3
3    5
4    6
5    1
Name: ARREST_MONTH, dtype: int32

In [47]:
# Separate the prediction target (offense level) from the remaining columns
target = df[_OFFENSE_LEVEL]
attr = df.drop(columns=[_OFFENSE_LEVEL])

# NOTE(review): LAW_CODE and OFNS_DESC remain in attr; confirm they do not
# trivially encode the offense level before using them as features.

# Report the number of feature columns, then preview the feature frame
print(len(attr.columns))
attr.head()


10


Unnamed: 0,OFNS_DESC,LAW_CODE,ARREST_BORO,ARREST_PRECINCT,JURISDICTION_CODE,X_COORD_CD,Y_COORD_CD,Latitude,Longitude,ARREST_MONTH
0,FELONY ASSAULT,PL 1211200,Q,105,0,1057545,207911,40.737043,-73.735514,1
2,FELONY ASSAULT,PL 1200502,B,48,0,1013900,250835,40.855109,-73.892818,3
3,FELONY ASSAULT,PL 1200512,S,121,0,938928,168468,40.628967,-74.163275,5
4,FELONY ASSAULT,PL 1211200,Q,100,0,1039777,155013,40.59198,-73.800066,6
5,RAPE,PL 1302503,M,14,0,985764,213806,40.753533,-73.994537,1


In [48]:
# Partition the columns of df by dtype: object columns vs. all other dtypes
df_obj = df.select_dtypes(include='object')
df_num = df.select_dtypes(exclude='object')

# NOTE(review): the selection is made on df, which still holds the
# offense-level (target) column, so that column lands in df_obj - confirm intended.

# Summarise both partitions (non-object columns first)
for _frame in (df_num, df_obj):
    _frame.info()

<class 'pandas.core.frame.DataFrame'>
Index: 188701 entries, 0 to 195446
Data columns (total 7 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   ARREST_PRECINCT    188701 non-null  int64  
 1   JURISDICTION_CODE  188701 non-null  int64  
 2   X_COORD_CD         188701 non-null  int64  
 3   Y_COORD_CD         188701 non-null  int64  
 4   Latitude           188701 non-null  float64
 5   Longitude          188701 non-null  float64
 6   ARREST_MONTH       188701 non-null  int32  
dtypes: float64(2), int32(1), int64(4)
memory usage: 10.8 MB
<class 'pandas.core.frame.DataFrame'>
Index: 188701 entries, 0 to 195446
Data columns (total 4 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   OFNS_DESC    188701 non-null  object
 1   LAW_CODE     188701 non-null  object
 2   LAW_CAT_CD   188701 non-null  object
 3   ARREST_BORO  188701 non-null  object
dtypes: object(4)
memory usage: 7.2+ MB

In [49]:

# One-hot encode the object columns; drop_first=True omits the first level
# of each encoded column.
# NOTE(review): if df_obj still contains the offense-level (target) column it
# is encoded here as well - confirm before using df_obj as model input.
df_obj = pd.get_dummies(data=df_obj, drop_first=True)

# Display the encoded frame
df_obj

Unnamed: 0,OFNS_DESC_ADMINISTRATIVE CODE,OFNS_DESC_ADMINISTRATIVE CODES,OFNS_DESC_ALCOHOLIC BEVERAGE CONTROL LAW,OFNS_DESC_ANTICIPATORY OFFENSES,OFNS_DESC_ARSON,OFNS_DESC_ASSAULT 3 & RELATED OFFENSES,OFNS_DESC_BURGLAR'S TOOLS,OFNS_DESC_BURGLARY,OFNS_DESC_CANNABIS RELATED OFFENSES,OFNS_DESC_CHILD ABANDONMENT/NON SUPPORT 1,...,LAW_CODE_VTL119807A,LAW_CODE_VTL119809D,LAW_CODE_VTL1212000,LAW_CODE_VTL21300A1,LAW_CAT_CD_M,LAW_CAT_CD_V,ARREST_BORO_K,ARREST_BORO_M,ARREST_BORO_Q,ARREST_BORO_S
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False
5,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195442,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,True,False,False,False,True,False
195443,False,False,False,False,False,True,False,False,False,False,...,False,False,False,False,True,False,False,True,False,False
195444,False,False,False,False,False,False,False,True,False,False,...,False,False,False,False,False,False,False,True,False,False
195445,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,True,False,False,False,False,False


In [50]:

# Class balance of the offense level: absolute counts, then percentage share
_offense_counts = df[_OFFENSE_LEVEL].value_counts()

print(_offense_counts)
print((_offense_counts / _offense_counts.sum()) * 100)


LAW_CAT_CD
M    106243
F     80720
V      1738
Name: count, dtype: int64
LAW_CAT_CD
M    56.302298
F    42.776668
V     0.921034
Name: count, dtype: float64


In [51]:
# Split the data into training (80%) and test (20%) sets.
# The offense levels are imbalanced (the rarest level is under 1% of the rows),
# so the split is stratified on the target to keep the class proportions the
# same in both sets. random_state fixes the shuffle for reproducibility.
attr_train, attr_test, target_train, target_test = train_test_split(
    attr,
    target,
    test_size=0.2,
    random_state=5,
    stratify=target,
)

In [52]:
# Display the feature frame (all columns except the offense level)
attr

Unnamed: 0,OFNS_DESC,LAW_CODE,ARREST_BORO,ARREST_PRECINCT,JURISDICTION_CODE,X_COORD_CD,Y_COORD_CD,Latitude,Longitude,ARREST_MONTH
0,FELONY ASSAULT,PL 1211200,Q,105,0,1057545,207911,40.737043,-73.735514,1
2,FELONY ASSAULT,PL 1200502,B,48,0,1013900,250835,40.855109,-73.892818,3
3,FELONY ASSAULT,PL 1200512,S,121,0,938928,168468,40.628967,-74.163275,5
4,FELONY ASSAULT,PL 1211200,Q,100,0,1039777,155013,40.591980,-73.800066,6
5,RAPE,PL 1302503,M,14,0,985764,213806,40.753533,-73.994537,1
...,...,...,...,...,...,...,...,...,...,...
195442,PETIT LARCENY,PL 1552500,Q,109,0,1030206,215367,40.757691,-73.834115,9
195443,ASSAULT 3 & RELATED OFFENSES,PL 1200001,M,34,0,1004094,251444,40.856808,-73.928265,9
195444,BURGLARY,PL 1402501,M,23,0,1000141,225685,40.786116,-73.942614,9
195445,OTHER OFFENSES RELATED TO THEFT,PL 1651503,B,44,1,1004749,240880,40.827812,-73.925929,9
