In [45]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import *
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer, make_column_selector,make_column_transformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score

In [46]:
_ARREST_KEY = "ARREST_KEY"

_ARREST_BORO = "ARREST_BORO"
_JURISDICTION_CODE = "JURISDICTION_CODE"
_OFFENSE_LEVEL = "LAW_CAT_CD"

_ARREST_DATE = "ARREST_DATE"
_ARREST_MONTH = "ARREST_MONTH" 

_ARREST_PRECINCT = "ARREST_PRECINCT"

_PERPETRATOR_RACE = "PERP_RACE"
_PERPETRATOR_SEX = "PERP_SEX"
_PERPETRATOR_AGE_GROUP="AGE_GROUP"

_INTERNAL_CLASSIFICATION="PD_CD"
_INTERNAL_CLASSIFICATION_DESCRIPTION="PD_DESC"
_THREE_DIGIT_INTERNAL_CLASSIFICATION="KY_CD"

_LAW_CODE="LAW_CODE"

_GEOGRAPHICAL_POSITION = "New Georeferenced Column"

_NYS_X_COORD = "X_COORD_CD"
_NYS_Y_COORD = "Y_COORD_CD"

_LATITUDE='Latitude'
_LONGITUDE='Longitude'


In [47]:
df = pd.read_csv('./cleaned_data.csv')
df.head()


Unnamed: 0,OFNS_DESC,LAW_CODE,LAW_CAT_CD,ARREST_BORO,ARREST_PRECINCT,JURISDICTION_CODE,Latitude,Longitude,ARREST_MONTH
0,FELONY ASSAULT,PL-1211200,F,Q,105,0,40.737043,-73.735514,January
1,FELONY ASSAULT,PL-1200502,F,B,48,0,40.855109,-73.892818,March
2,FELONY ASSAULT,PL-1200512,F,S,121,0,40.628967,-74.163275,May
3,FELONY ASSAULT,PL-1211200,F,Q,100,0,40.59198,-73.800066,June
4,RAPE,PL-1302503,F,M,14,0,40.753533,-73.994537,January


In [48]:
df[_ARREST_PRECINCT] = df[_ARREST_PRECINCT].astype('object')


In [49]:
df_num= df.select_dtypes(exclude='object')
df_obj= df.select_dtypes(include='object')

df_num.info()
df_obj.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 188682 entries, 0 to 188681
Data columns (total 3 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   JURISDICTION_CODE  188682 non-null  int64  
 1   Latitude           188682 non-null  float64
 2   Longitude          188682 non-null  float64
dtypes: float64(2), int64(1)
memory usage: 4.3 MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 188682 entries, 0 to 188681
Data columns (total 6 columns):
 #   Column           Non-Null Count   Dtype 
---  ------           --------------   ----- 
 0   OFNS_DESC        188682 non-null  object
 1   LAW_CODE         188682 non-null  object
 2   LAW_CAT_CD       188682 non-null  object
 3   ARREST_BORO      188682 non-null  object
 4   ARREST_PRECINCT  188682 non-null  object
 5   ARREST_MONTH     188682 non-null  object
dtypes: object(6)
memory usage: 8.6+ MB


In [None]:

df_obj[_OFFENSE_LEVEL].unique()
#df_obj=df_obj.drop(columns=["OFNS_DESC","LAW_CODE"])# drop some columns to deal with out of mem error
#!!! these features make a huge diff in accuracy.

array(['F', 'M', 'V'], dtype=object)

In [51]:
df_obj= pd.get_dummies(df_obj, drop_first=True)

target = (df_obj['LAW_CAT_CD_M'])
df_obj = df_obj.drop(columns=['LAW_CAT_CD_M'],axis=1)
df_obj

Unnamed: 0,OFNS_DESC_ADMINISTRATIVE CODES,OFNS_DESC_ALCOHOLIC BEVERAGE CONTROL LAW,OFNS_DESC_ANTICIPATORY OFFENSES,OFNS_DESC_ARSON,OFNS_DESC_ASSAULT 3 & RELATED OFFENSES,OFNS_DESC_BURGLAR'S TOOLS,OFNS_DESC_BURGLARY,OFNS_DESC_CANNABIS RELATED OFFENSES,OFNS_DESC_CHILD ABANDONMENT/NON SUPPORT 1,OFNS_DESC_CRIMINAL MISCHIEF & RELATED OF,...,ARREST_PRECINCT_122,ARREST_PRECINCT_123,ARREST_MONTH_August,ARREST_MONTH_February,ARREST_MONTH_January,ARREST_MONTH_July,ARREST_MONTH_June,ARREST_MONTH_March,ARREST_MONTH_May,ARREST_MONTH_September
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,True,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,True,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,True,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
188677,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
188678,False,False,False,False,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
188679,False,False,False,False,False,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,True
188680,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True


In [52]:
attr = pd.concat([df_obj,df_num],axis=1)


print(attr.shape[1])
attr.head()

1103


Unnamed: 0,OFNS_DESC_ADMINISTRATIVE CODES,OFNS_DESC_ALCOHOLIC BEVERAGE CONTROL LAW,OFNS_DESC_ANTICIPATORY OFFENSES,OFNS_DESC_ARSON,OFNS_DESC_ASSAULT 3 & RELATED OFFENSES,OFNS_DESC_BURGLAR'S TOOLS,OFNS_DESC_BURGLARY,OFNS_DESC_CANNABIS RELATED OFFENSES,OFNS_DESC_CHILD ABANDONMENT/NON SUPPORT 1,OFNS_DESC_CRIMINAL MISCHIEF & RELATED OF,...,ARREST_MONTH_February,ARREST_MONTH_January,ARREST_MONTH_July,ARREST_MONTH_June,ARREST_MONTH_March,ARREST_MONTH_May,ARREST_MONTH_September,JURISDICTION_CODE,Latitude,Longitude
0,False,False,False,False,False,False,False,False,False,False,...,False,True,False,False,False,False,False,0,40.737043,-73.735514
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,True,False,False,0,40.855109,-73.892818
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,True,False,0,40.628967,-74.163275
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,True,False,False,False,0,40.59198,-73.800066
4,False,False,False,False,False,False,False,False,False,False,...,False,True,False,False,False,False,False,0,40.753533,-73.994537


In [53]:
from sklearn.model_selection import train_test_split

# Split the data into training and test
attr_train, attr_test, target_train, target_test = train_test_split(attr, target, test_size=0.20, random_state=5)

In [54]:
attr_train.head()

Unnamed: 0,OFNS_DESC_ADMINISTRATIVE CODES,OFNS_DESC_ALCOHOLIC BEVERAGE CONTROL LAW,OFNS_DESC_ANTICIPATORY OFFENSES,OFNS_DESC_ARSON,OFNS_DESC_ASSAULT 3 & RELATED OFFENSES,OFNS_DESC_BURGLAR'S TOOLS,OFNS_DESC_BURGLARY,OFNS_DESC_CANNABIS RELATED OFFENSES,OFNS_DESC_CHILD ABANDONMENT/NON SUPPORT 1,OFNS_DESC_CRIMINAL MISCHIEF & RELATED OF,...,ARREST_MONTH_February,ARREST_MONTH_January,ARREST_MONTH_July,ARREST_MONTH_June,ARREST_MONTH_March,ARREST_MONTH_May,ARREST_MONTH_September,JURISDICTION_CODE,Latitude,Longitude
157705,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,0,40.594054,-73.960866
57809,False,False,False,False,False,False,False,False,False,False,...,True,False,False,False,False,False,False,0,40.701098,-73.903569
25319,False,False,False,False,False,False,False,False,False,False,...,True,False,False,False,False,False,False,0,40.822306,-73.914843
48111,False,False,False,False,False,False,False,False,False,False,...,True,False,False,False,False,False,False,1,40.604783,-73.753998
61612,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,True,False,False,0,40.871489,-73.904459


In [55]:
from sklearn.compose import make_column_selector

cat_selector = make_column_selector(dtype_include=bool)
num_selector = make_column_selector(dtype_include=np.number)
num_selector(attr_train)

['JURISDICTION_CODE', 'Latitude', 'Longitude']

In [56]:
cat_selector(attr_train)

['OFNS_DESC_ADMINISTRATIVE CODES',
 'OFNS_DESC_ALCOHOLIC BEVERAGE CONTROL LAW',
 'OFNS_DESC_ANTICIPATORY OFFENSES',
 'OFNS_DESC_ARSON',
 'OFNS_DESC_ASSAULT 3 & RELATED OFFENSES',
 "OFNS_DESC_BURGLAR'S TOOLS",
 'OFNS_DESC_BURGLARY',
 'OFNS_DESC_CANNABIS RELATED OFFENSES',
 'OFNS_DESC_CHILD ABANDONMENT/NON SUPPORT 1',
 'OFNS_DESC_CRIMINAL MISCHIEF & RELATED OF',
 'OFNS_DESC_CRIMINAL TRESPASS',
 'OFNS_DESC_DANGEROUS DRUGS',
 'OFNS_DESC_DANGEROUS WEAPONS',
 'OFNS_DESC_DISORDERLY CONDUCT',
 'OFNS_DESC_DISRUPTION OF A RELIGIOUS SERV',
 'OFNS_DESC_ESCAPE 3',
 'OFNS_DESC_FELONY ASSAULT',
 'OFNS_DESC_FORGERY',
 'OFNS_DESC_FORTUNE TELLING',
 'OFNS_DESC_FRAUDS',
 'OFNS_DESC_FRAUDULENT ACCOSTING',
 'OFNS_DESC_GAMBLING',
 'OFNS_DESC_GRAND LARCENY',
 'OFNS_DESC_GRAND LARCENY OF MOTOR VEHICLE',
 'OFNS_DESC_HARRASSMENT 2',
 'OFNS_DESC_HOMICIDE-NEGLIGENT,UNCLASSIFIE',
 'OFNS_DESC_HOMICIDE-NEGLIGENT-VEHICLE',
 'OFNS_DESC_INTOXICATED & IMPAIRED DRIVING',
 'OFNS_DESC_INTOXICATED/IMPAIRED DRIVING',
 'OFNS_

In [57]:
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.compose import make_column_transformer

categoricalProcessor = OneHotEncoder(
    handle_unknown="infrequent_if_exist"
)
numericalProcessor = MinMaxScaler()

categoricalPreprocessor = make_column_transformer(
        (categoricalProcessor, cat_selector),
    

)

numericalPreprocessor = make_column_transformer(
        (numericalProcessor, num_selector),
   

)
# See https://scikit-learn.org/stable/auto_examples/ensemble/plot_stack_predictors.html

In [58]:
# Create a pipeline with preprocessing and model
categoricalModel = Pipeline(steps=[('preprocessor', categoricalPreprocessor),
                        ('classifier', CategoricalNB())])

continuousModel = Pipeline(steps=[('preprocessor', numericalPreprocessor),
                        ('classifier', GaussianNB()
)])




In [59]:
# Fit the models
categoricalModel.fit(attr_train, target_train)


In [60]:
continuousModel.fit(attr_train, target_train)

In [61]:
target_pred_categorical = categoricalModel.predict_proba(attr_test)


In [62]:
target_pred_continuous = continuousModel.predict_proba(attr_test)


In [63]:
target_pred_categorical.size

75474

In [64]:
target_pred_continuous.size

75474

In [65]:
combined_probs = (target_pred_categorical + target_pred_continuous) / 2

combined_preds = np.argmax(combined_probs, axis=1)

In [66]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

# Evaluate the accuracy

#Compute and print the accuracy score.
accuracy = accuracy_score(target_test, combined_preds)
print(f'Accuracy: {accuracy}')
print()

# Print the confusion matrix
print(confusion_matrix(target_test, combined_preds))
print()

#Print the classification report
print('Classification Report')
print(classification_report(target_test, combined_preds))

Accuracy: 0.9987545379865914

[[16296    34]
 [   13 21394]]

Classification Report
              precision    recall  f1-score   support

       False       1.00      1.00      1.00     16330
        True       1.00      1.00      1.00     21407

    accuracy                           1.00     37737
   macro avg       1.00      1.00      1.00     37737
weighted avg       1.00      1.00      1.00     37737

