In [111]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import *
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer, make_column_selector,make_column_transformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score

In [112]:
_ARREST_KEY = "ARREST_KEY"

_ARREST_BORO = "ARREST_BORO"
_JURISDICTION_CODE = "JURISDICTION_CODE"
_OFFENSE_LEVEL = "LAW_CAT_CD"

_ARREST_DATE = "ARREST_DATE"
_ARREST_MONTH = "ARREST_MONTH" 

_ARREST_PRECINCT = "ARREST_PRECINCT"

_PERPETRATOR_RACE = "PERP_RACE"
_PERPETRATOR_SEX = "PERP_SEX"
_PERPETRATOR_AGE_GROUP="AGE_GROUP"

_INTERNAL_CLASSIFICATION="PD_CD"
_INTERNAL_CLASSIFICATION_DESCRIPTION="PD_DESC"
_THREE_DIGIT_INTERNAL_CLASSIFICATION="KY_CD"

_LAW_CODE="LAW_CODE"

_GEOGRAPHICAL_POSITION = "New Georeferenced Column"

_NYS_X_COORD = "X_COORD_CD"
_NYS_Y_COORD = "Y_COORD_CD"

_LATITUDE='Latitude'
_LONGITUDE='Longitude'


In [113]:
df = pd.read_csv('./cleaned_data.csv')
df.head()


Unnamed: 0,OFNS_DESC,LAW_CODE,LAW_CAT_CD,ARREST_BORO,ARREST_PRECINCT,JURISDICTION_CODE,Latitude,Longitude,ARREST_MONTH
0,FELONY ASSAULT,PL-1211200,F,Q,105,0,40.737043,-73.735514,January
1,FELONY ASSAULT,PL-1200502,F,B,48,0,40.855109,-73.892818,March
2,FELONY ASSAULT,PL-1200512,F,S,121,0,40.628967,-74.163275,May
3,FELONY ASSAULT,PL-1211200,F,Q,100,0,40.59198,-73.800066,June
4,RAPE,PL-1302503,F,M,14,0,40.753533,-73.994537,January


In [114]:
df[_ARREST_PRECINCT] = df[_ARREST_PRECINCT].astype('object')


In [115]:
df_num= df.select_dtypes(exclude='object')
df_obj= df.select_dtypes(include='object')

df_num.info()
df_obj.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 188682 entries, 0 to 188681
Data columns (total 3 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   JURISDICTION_CODE  188682 non-null  int64  
 1   Latitude           188682 non-null  float64
 2   Longitude          188682 non-null  float64
dtypes: float64(2), int64(1)
memory usage: 4.3 MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 188682 entries, 0 to 188681
Data columns (total 6 columns):
 #   Column           Non-Null Count   Dtype 
---  ------           --------------   ----- 
 0   OFNS_DESC        188682 non-null  object
 1   LAW_CODE         188682 non-null  object
 2   LAW_CAT_CD       188682 non-null  object
 3   ARREST_BORO      188682 non-null  object
 4   ARREST_PRECINCT  188682 non-null  object
 5   ARREST_MONTH     188682 non-null  object
dtypes: object(6)
memory usage: 8.6+ MB


In [116]:
df_obj[_OFFENSE_LEVEL].unique()
df_obj=df_obj.drop(columns=["OFNS_DESC","LAW_CODE"])# drop some columns to deal with out of mem error

In [117]:
df_obj= pd.get_dummies(df_obj, drop_first=True)
df_obj = df_obj.drop('LAW_CAT_CD_V',axis=1)


In [118]:
# Drop the target column from the data frame
attr = pd.concat([df_obj,df_num],axis=1)


target = (df_obj['LAW_CAT_CD_M'])

print(attr.shape[1])
attr.head()

92


Unnamed: 0,LAW_CAT_CD_M,ARREST_BORO_K,ARREST_BORO_M,ARREST_BORO_Q,ARREST_BORO_S,ARREST_PRECINCT_5,ARREST_PRECINCT_6,ARREST_PRECINCT_7,ARREST_PRECINCT_9,ARREST_PRECINCT_10,...,ARREST_MONTH_February,ARREST_MONTH_January,ARREST_MONTH_July,ARREST_MONTH_June,ARREST_MONTH_March,ARREST_MONTH_May,ARREST_MONTH_September,JURISDICTION_CODE,Latitude,Longitude
0,False,False,False,True,False,False,False,False,False,False,...,False,True,False,False,False,False,False,0,40.737043,-73.735514
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,True,False,False,0,40.855109,-73.892818
2,False,False,False,False,True,False,False,False,False,False,...,False,False,False,False,False,True,False,0,40.628967,-74.163275
3,False,False,False,True,False,False,False,False,False,False,...,False,False,False,True,False,False,False,0,40.59198,-73.800066
4,False,False,True,False,False,False,False,False,False,False,...,False,True,False,False,False,False,False,0,40.753533,-73.994537


In [119]:
from sklearn.model_selection import train_test_split

# Split the data into training and test
attr_train, attr_test, target_train, target_test = train_test_split(attr, target, test_size=0.20, random_state=5)

In [120]:
attr_train.head()

Unnamed: 0,LAW_CAT_CD_M,ARREST_BORO_K,ARREST_BORO_M,ARREST_BORO_Q,ARREST_BORO_S,ARREST_PRECINCT_5,ARREST_PRECINCT_6,ARREST_PRECINCT_7,ARREST_PRECINCT_9,ARREST_PRECINCT_10,...,ARREST_MONTH_February,ARREST_MONTH_January,ARREST_MONTH_July,ARREST_MONTH_June,ARREST_MONTH_March,ARREST_MONTH_May,ARREST_MONTH_September,JURISDICTION_CODE,Latitude,Longitude
157705,False,True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,0,40.594054,-73.960866
57809,True,False,False,True,False,False,False,False,False,False,...,True,False,False,False,False,False,False,0,40.701098,-73.903569
25319,True,False,False,False,False,False,False,False,False,False,...,True,False,False,False,False,False,False,0,40.822306,-73.914843
48111,True,False,False,True,False,False,False,False,False,False,...,True,False,False,False,False,False,False,1,40.604783,-73.753998
61612,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,True,False,False,0,40.871489,-73.904459


In [121]:
from sklearn.compose import make_column_selector

cat_selector = make_column_selector(dtype_include=bool)
num_selector = make_column_selector(dtype_include=np.number)
num_selector(attr_train)

['JURISDICTION_CODE', 'Latitude', 'Longitude']

In [122]:
cat_selector(attr_train)

['LAW_CAT_CD_M',
 'ARREST_BORO_K',
 'ARREST_BORO_M',
 'ARREST_BORO_Q',
 'ARREST_BORO_S',
 'ARREST_PRECINCT_5',
 'ARREST_PRECINCT_6',
 'ARREST_PRECINCT_7',
 'ARREST_PRECINCT_9',
 'ARREST_PRECINCT_10',
 'ARREST_PRECINCT_13',
 'ARREST_PRECINCT_14',
 'ARREST_PRECINCT_17',
 'ARREST_PRECINCT_18',
 'ARREST_PRECINCT_19',
 'ARREST_PRECINCT_20',
 'ARREST_PRECINCT_22',
 'ARREST_PRECINCT_23',
 'ARREST_PRECINCT_24',
 'ARREST_PRECINCT_25',
 'ARREST_PRECINCT_26',
 'ARREST_PRECINCT_28',
 'ARREST_PRECINCT_30',
 'ARREST_PRECINCT_32',
 'ARREST_PRECINCT_33',
 'ARREST_PRECINCT_34',
 'ARREST_PRECINCT_40',
 'ARREST_PRECINCT_41',
 'ARREST_PRECINCT_42',
 'ARREST_PRECINCT_43',
 'ARREST_PRECINCT_44',
 'ARREST_PRECINCT_45',
 'ARREST_PRECINCT_46',
 'ARREST_PRECINCT_47',
 'ARREST_PRECINCT_48',
 'ARREST_PRECINCT_49',
 'ARREST_PRECINCT_50',
 'ARREST_PRECINCT_52',
 'ARREST_PRECINCT_60',
 'ARREST_PRECINCT_61',
 'ARREST_PRECINCT_62',
 'ARREST_PRECINCT_63',
 'ARREST_PRECINCT_66',
 'ARREST_PRECINCT_67',
 'ARREST_PRECINCT_

In [126]:
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.compose import make_column_transformer

categoricalProcessor = OneHotEncoder(
    handle_unknown="infrequent_if_exist"
)
numericalProcessor = MinMaxScaler()

categoricalPreprocessor = make_column_transformer(
        (categoricalProcessor, cat_selector),
    

)

numericalPreprocessor = make_column_transformer(
        (numericalProcessor, num_selector),
   

)
# See https://scikit-learn.org/stable/auto_examples/ensemble/plot_stack_predictors.html

In [127]:
# Create a pipeline with preprocessing and model
categoricalModel = Pipeline(steps=[('preprocessor', categoricalPreprocessor),
                        ('classifier', CategoricalNB())])

continuousModel = Pipeline(steps=[('preprocessor', numericalPreprocessor),
                        ('classifier', GaussianNB()
)])




In [128]:
# Fit the models
categoricalModel.fit(attr_train, target_train)


In [129]:
continuousModel.fit(attr_train, target_train)

In [136]:
target_pred_categorical = categoricalModel.predict_proba(attr_test)


In [137]:
target_pred_continuous = continuousModel.predict_proba(attr_test)


In [143]:
target_pred_categorical.size

75474

In [142]:
target_pred_continuous.size

75474

In [None]:
combined_probs = (target_pred_categorical + target_pred_continuous) / 2

combined_preds = np.argmax(combined_probs, axis=1)

In [141]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

# Evaluate the accuracy

#Compute and print the accuracy score.
accuracy = accuracy_score(target_test, combined_preds)
print(f'Accuracy: {accuracy}')
print()

# Print the confusion matrix
print(confusion_matrix(target_test, combined_preds))
print()

#Print the classification report
print('Classification Report')
print(classification_report(target_test, combined_preds))

Accuracy: 1.0

[[16330     0]
 [    0 21407]]

Classification Report
              precision    recall  f1-score   support

       False       1.00      1.00      1.00     16330
        True       1.00      1.00      1.00     21407

    accuracy                           1.00     37737
   macro avg       1.00      1.00      1.00     37737
weighted avg       1.00      1.00      1.00     37737

