In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split


In [4]:
# Note: the data is updated quarterly. Last update was 10-23-2024
df = pd.read_csv('./NYPD_Arrest_Data__Year_to_Date__20241023.csv')
df.head(10)

Unnamed: 0,ARREST_KEY,ARREST_DATE,PD_CD,PD_DESC,KY_CD,OFNS_DESC,LAW_CODE,LAW_CAT_CD,ARREST_BORO,ARREST_PRECINCT,JURISDICTION_CODE,AGE_GROUP,PERP_SEX,PERP_RACE,X_COORD_CD,Y_COORD_CD,Latitude,Longitude,New Georeferenced Column
0,281240883,01/28/2024,105.0,STRANGULATION 1ST,106.0,FELONY ASSAULT,PL 1211200,F,Q,105,0,25-44,M,WHITE,1057545,207911,40.737043,-73.735514,POINT (-73.735514 40.737043)
1,282884120,02/27/2024,263.0,"ARSON 2,3,4",114.0,ARSON,PL 1501001,F,Q,107,71,45-64,M,WHITE,1037489,206343,40.732881,-73.807899,POINT (-73.807899 40.732881)
2,283137868,03/03/2024,109.0,"ASSAULT 2,1,UNCLASSIFIED",106.0,FELONY ASSAULT,PL 1200502,F,B,48,0,25-44,M,BLACK,1013900,250835,40.855109,-73.892818,POINT (-73.892818 40.855109)
3,287001362,05/16/2024,109.0,"ASSAULT 2,1,UNCLASSIFIED",106.0,FELONY ASSAULT,PL 1200512,F,S,121,0,25-44,M,WHITE,938928,168468,40.628967,-74.163275,POINT (-74.163275 40.628967)
4,287829614,06/02/2024,105.0,STRANGULATION 1ST,106.0,FELONY ASSAULT,PL 1211200,F,Q,100,0,25-44,M,BLACK,1039777,155013,40.59198,-73.800066,POINT (-73.800066 40.59198)
5,280513565,01/14/2024,153.0,RAPE 3,104.0,RAPE,PL 1302503,F,M,14,0,18-24,M,BLACK,985764,213806,40.753533,-73.994537,POINT (-73.9945368920152 40.7535327012632)
6,291269261,08/07/2024,157.0,RAPE 1,104.0,RAPE,PL 1303504,F,K,84,0,25-44,M,WHITE HISPANIC,988902,192641,40.695439,-73.983225,POINT (-73.9832253756043 40.6954388081238)
7,280286274,01/10/2024,105.0,STRANGULATION 1ST,106.0,FELONY ASSAULT,PL 1211200,F,K,70,0,25-44,M,BLACK,993690,172242,40.639436,-73.965983,POINT (-73.965983 40.639436)
8,281035905,01/24/2024,777.0,(null),,(null),PL 1950200,F,K,67,0,45-64,F,WHITE,997897,175676,40.648859,-73.95082,POINT (-73.95082 40.648859)
9,279805425,01/02/2024,109.0,"ASSAULT 2,1,UNCLASSIFIED",106.0,FELONY ASSAULT,PL 1200502,F,Q,100,0,25-44,M,BLACK,1035353,152906,40.586222,-73.816011,POINT (-73.816011 40.586222)


In [5]:
# Predict Level of Offense based on the Borough and Jurisdiction
# Could also consider using Precinct. Precinct roughly corresponds to neighborhood
_ARREST_BORO = "ARREST_BORO"
_JURISDICTION_CODE = "JURISDICTION_CODE"
_OFFENSE_LEVEL = "LAW_CAT_CD"

# Ensure that offense level is either misdemeanor, felony, or violation
df = df.loc[(df[_OFFENSE_LEVEL] == 'F') | (df[_OFFENSE_LEVEL] == 'M') | (df[_OFFENSE_LEVEL] == 'V')]
print((df[_OFFENSE_LEVEL] == 'F') .any())

# Limit jurisdictions to patrol, transit, and housing (others are non NYPD)
df = df.loc[(df[_JURISDICTION_CODE] ==0) | (df[_JURISDICTION_CODE] == 1) | (df[_JURISDICTION_CODE] == 2)]

# Ensure all borough is either Queens, Brooklyn, Manhattan, Bronx, or Staten Island
df = df.loc[(df[_ARREST_BORO] == 'Q') | (df[_ARREST_BORO] == 'K') | (df[_ARREST_BORO] == 'M')| (df[_ARREST_BORO] == 'B') | (df[_ARREST_BORO] == 'S')]

# Remove the first column from the data frame (it's just an identifier)
attr = df.drop("ARREST_KEY", axis=1)

# Drop the target column from the data frame
attr = df.drop(_OFFENSE_LEVEL, axis=1)
target = df[_OFFENSE_LEVEL]


attr.head()

True


Unnamed: 0,ARREST_KEY,ARREST_DATE,PD_CD,PD_DESC,KY_CD,OFNS_DESC,LAW_CODE,ARREST_BORO,ARREST_PRECINCT,JURISDICTION_CODE,AGE_GROUP,PERP_SEX,PERP_RACE,X_COORD_CD,Y_COORD_CD,Latitude,Longitude,New Georeferenced Column
0,281240883,01/28/2024,105.0,STRANGULATION 1ST,106.0,FELONY ASSAULT,PL 1211200,Q,105,0,25-44,M,WHITE,1057545,207911,40.737043,-73.735514,POINT (-73.735514 40.737043)
2,283137868,03/03/2024,109.0,"ASSAULT 2,1,UNCLASSIFIED",106.0,FELONY ASSAULT,PL 1200502,B,48,0,25-44,M,BLACK,1013900,250835,40.855109,-73.892818,POINT (-73.892818 40.855109)
3,287001362,05/16/2024,109.0,"ASSAULT 2,1,UNCLASSIFIED",106.0,FELONY ASSAULT,PL 1200512,S,121,0,25-44,M,WHITE,938928,168468,40.628967,-74.163275,POINT (-74.163275 40.628967)
4,287829614,06/02/2024,105.0,STRANGULATION 1ST,106.0,FELONY ASSAULT,PL 1211200,Q,100,0,25-44,M,BLACK,1039777,155013,40.59198,-73.800066,POINT (-73.800066 40.59198)
5,280513565,01/14/2024,153.0,RAPE 3,104.0,RAPE,PL 1302503,M,14,0,18-24,M,BLACK,985764,213806,40.753533,-73.994537,POINT (-73.9945368920152 40.7535327012632)


In [21]:
df_num= df.select_dtypes(exclude='object')
df_obj= df.select_dtypes(include='object')
df_num.info()
df_obj.info()

<class 'pandas.core.frame.DataFrame'>
Index: 188701 entries, 0 to 195446
Data columns (total 9 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   ARREST_KEY         188701 non-null  int64  
 1   PD_CD              188701 non-null  float64
 2   KY_CD              188682 non-null  float64
 3   ARREST_PRECINCT    188701 non-null  int64  
 4   JURISDICTION_CODE  188701 non-null  int64  
 5   X_COORD_CD         188701 non-null  int64  
 6   Y_COORD_CD         188701 non-null  int64  
 7   Latitude           188701 non-null  float64
 8   Longitude          188701 non-null  float64
dtypes: float64(4), int64(5)
memory usage: 14.4 MB
<class 'pandas.core.frame.DataFrame'>
Index: 188701 entries, 0 to 195446
Data columns (total 10 columns):
 #   Column                    Non-Null Count   Dtype 
---  ------                    --------------   ----- 
 0   ARREST_DATE               188701 non-null  object
 1   PD_DESC                   188

In [22]:

df_obj= pd.get_dummies(df_obj, drop_first=True)
df_obj

Unnamed: 0,ARREST_DATE_01/02/2024,ARREST_DATE_01/03/2024,ARREST_DATE_01/04/2024,ARREST_DATE_01/05/2024,ARREST_DATE_01/06/2024,ARREST_DATE_01/07/2024,ARREST_DATE_01/08/2024,ARREST_DATE_01/09/2024,ARREST_DATE_01/10/2024,ARREST_DATE_01/11/2024,...,New Georeferenced Column_POINT (-74.247875 40.510096),New Georeferenced Column_POINT (-74.24827692615737 40.51051810347657),New Georeferenced Column_POINT (-74.249302 40.511577),New Georeferenced Column_POINT (-74.24935765721436 40.51005188431608),New Georeferenced Column_POINT (-74.2497549524458 40.5120382535219),New Georeferenced Column_POINT (-74.250331 40.513488),New Georeferenced Column_POINT (-74.25081 40.510118),New Georeferenced Column_POINT (-74.251844 40.504259),New Georeferenced Column_POINT (-74.2527114132383 40.51060174896178),New Georeferenced Column_POINT (0 0)
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
5,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195442,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
195443,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
195444,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
195445,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [23]:

print(df[_OFFENSE_LEVEL].value_counts())
print( (df[_OFFENSE_LEVEL].value_counts()/ df[_OFFENSE_LEVEL].value_counts().sum()) * 100)


LAW_CAT_CD
M    106243
F     80720
V      1738
Name: count, dtype: int64
LAW_CAT_CD
M    56.302298
F    42.776668
V     0.921034
Name: count, dtype: float64


In [6]:
# Split the data into training and test
attr_train, attr_test, target_train, target_test = train_test_split(attr, target, test_size=0.2, random_state=5)

In [13]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
# Identify the numerical columns
# print(df.select_dtypes(include=int))

# Be careful - ARREST_PRECINCT and JURISDICTION code are numerical, but are categorical (not continuous)
categorical_columns=['ARREST_PRECINCT', _JURISDICTION_CODE]

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(), categorical_columns)
    ]
)




In [26]:
attr

Unnamed: 0,ARREST_KEY,ARREST_DATE,PD_CD,PD_DESC,KY_CD,OFNS_DESC,LAW_CODE,ARREST_BORO,ARREST_PRECINCT,JURISDICTION_CODE,AGE_GROUP,PERP_SEX,PERP_RACE,X_COORD_CD,Y_COORD_CD,Latitude,Longitude,New Georeferenced Column
0,281240883,01/28/2024,105.0,STRANGULATION 1ST,106.0,FELONY ASSAULT,PL 1211200,Q,105,0,25-44,M,WHITE,1057545,207911,40.737043,-73.735514,POINT (-73.735514 40.737043)
2,283137868,03/03/2024,109.0,"ASSAULT 2,1,UNCLASSIFIED",106.0,FELONY ASSAULT,PL 1200502,B,48,0,25-44,M,BLACK,1013900,250835,40.855109,-73.892818,POINT (-73.892818 40.855109)
3,287001362,05/16/2024,109.0,"ASSAULT 2,1,UNCLASSIFIED",106.0,FELONY ASSAULT,PL 1200512,S,121,0,25-44,M,WHITE,938928,168468,40.628967,-74.163275,POINT (-74.163275 40.628967)
4,287829614,06/02/2024,105.0,STRANGULATION 1ST,106.0,FELONY ASSAULT,PL 1211200,Q,100,0,25-44,M,BLACK,1039777,155013,40.591980,-73.800066,POINT (-73.800066 40.59198)
5,280513565,01/14/2024,153.0,RAPE 3,104.0,RAPE,PL 1302503,M,14,0,18-24,M,BLACK,985764,213806,40.753533,-73.994537,POINT (-73.9945368920152 40.7535327012632)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195442,293968561,09/27/2024,339.0,"LARCENY,PETIT FROM OPEN AREAS,",341.0,PETIT LARCENY,PL 1552500,Q,109,0,18-24,M,WHITE HISPANIC,1030206,215367,40.757691,-73.834115,POINT (-73.834115 40.757691)
195443,292591949,09/01/2024,101.0,ASSAULT 3,344.0,ASSAULT 3 & RELATED OFFENSES,PL 1200001,M,34,0,25-44,M,WHITE HISPANIC,1004094,251444,40.856808,-73.928265,POINT (-73.92826482905474 40.85680819814297)
195444,294028627,09/29/2024,244.0,"BURGLARY,UNCLASSIFIED,UNKNOWN",107.0,BURGLARY,PL 1402501,M,23,0,45-64,M,WHITE,1000141,225685,40.786116,-73.942614,POINT (-73.942614 40.786116)
195445,292933189,09/08/2024,478.0,"THEFT OF SERVICES, UNCLASSIFIE",343.0,OTHER OFFENSES RELATED TO THEFT,PL 1651503,B,44,1,25-44,M,BLACK,1004749,240880,40.827812,-73.925929,POINT (-73.92592932593037 40.82781161940969)


In [14]:
#KNN
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline

knn = KNeighborsClassifier()

model = Pipeline(steps=[
    ('preprocessor',preprocessor),
    ('classifier',knn)
])

model.fit (attr_train,target_train)
target_pred = model.predict(attr_test)


In [15]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

# Evaluate the accuracy

#Compute and print the accuracy score.
accuracy = accuracy_score(target_test, target_pred)
print(f'Accuracy: {accuracy}')
print()

# Print the confusion matrix
print(confusion_matrix(target_test, target_pred))
print()

#Print the classification report
print('Classification Report')
print(classification_report(target_test, target_pred))

Accuracy: 0.5365252643014229

[[ 7134  8996     1]
 [ 8128 13107    31]
 [  136   200     8]]

Classification Report
              precision    recall  f1-score   support

           F       0.46      0.44      0.45     16131
           M       0.59      0.62      0.60     21266
           V       0.20      0.02      0.04       344

    accuracy                           0.54     37741
   macro avg       0.42      0.36      0.37     37741
weighted avg       0.53      0.54      0.53     37741



In [None]:
# Evaluate default accuracy using baseline model
from sklearn.dummy import DummyClassifier

baseline_model = DummyClassifier(strategy='most_frequent')
baseline_model.fit(attr_train, target_train)

y_pred = baseline_model.predict(attr_test)
accuracy = accuracy_score(target_test, target_pred)
print(f'Default accuracy: {accuracy}')

Default accuracy: 0.5365252643014229
