In [129]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import *
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer, make_column_selector,make_column_transformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score

In [130]:
_ARREST_KEY = "ARREST_KEY"

_ARREST_BORO = "ARREST_BORO"
_JURISDICTION_CODE = "JURISDICTION_CODE"
_OFFENSE_LEVEL = "LAW_CAT_CD"

_ARREST_DATE = "ARREST_DATE"
_ARREST_MONTH = "ARREST_MONTH" 

_ARREST_PRECINCT = "ARREST_PRECINCT"

_PERPETRATOR_RACE = "PERP_RACE"
_PERPETRATOR_SEX = "PERP_SEX"
_PERPETRATOR_AGE_GROUP="AGE_GROUP"

_INTERNAL_CLASSIFICATION="PD_CD"
_INTERNAL_CLASSIFICATION_DESCRIPTION="PD_DESC"
_THREE_DIGIT_INTERNAL_CLASSIFICATION="KY_CD"

_LAW_CODE="LAW_CODE"

_GEOGRAPHICAL_POSITION = "New Georeferenced Column"

_NYS_X_COORD = "X_COORD_CD"
_NYS_Y_COORD = "Y_COORD_CD"

_LATITUDE='Latitude'
_LONGITUDE='Longitude'


In [131]:
df = pd.read_csv('./cleaned_data.csv')
df.head()


Unnamed: 0,OFNS_DESC,LAW_CODE,LAW_CAT_CD,ARREST_BORO,ARREST_PRECINCT,JURISDICTION_CODE,Latitude,Longitude,ARREST_MONTH
0,FELONY ASSAULT,PL-1211200,F,Q,105,0,40.737043,-73.735514,January
1,FELONY ASSAULT,PL-1200502,F,B,48,0,40.855109,-73.892818,March
2,FELONY ASSAULT,PL-1200512,F,S,121,0,40.628967,-74.163275,May
3,FELONY ASSAULT,PL-1211200,F,Q,100,0,40.59198,-73.800066,June
4,RAPE,PL-1302503,F,M,14,0,40.753533,-73.994537,January


In [132]:
df[_ARREST_PRECINCT] = df[_ARREST_PRECINCT].astype('object')
df[_JURISDICTION_CODE] = df[_JURISDICTION_CODE].astype('object')

In [133]:
df_num= df.select_dtypes(exclude='object')
df_obj= df.select_dtypes(include='object')

df_num.info()
df_obj.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 188682 entries, 0 to 188681
Data columns (total 2 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   Latitude   188682 non-null  float64
 1   Longitude  188682 non-null  float64
dtypes: float64(2)
memory usage: 2.9 MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 188682 entries, 0 to 188681
Data columns (total 7 columns):
 #   Column             Non-Null Count   Dtype 
---  ------             --------------   ----- 
 0   OFNS_DESC          188682 non-null  object
 1   LAW_CODE           188682 non-null  object
 2   LAW_CAT_CD         188682 non-null  object
 3   ARREST_BORO        188682 non-null  object
 4   ARREST_PRECINCT    188682 non-null  object
 5   JURISDICTION_CODE  188682 non-null  object
 6   ARREST_MONTH       188682 non-null  object
dtypes: object(7)
memory usage: 10.1+ MB


In [134]:

df_obj[_OFFENSE_LEVEL].unique()
#df_obj=df_obj.drop(columns=["OFNS_DESC","LAW_CODE"])# drop some columns to deal with out of mem error
#!!! these features make a huge diff in accuracy.

array(['F', 'M', 'V'], dtype=object)

In [135]:
# df_obj= pd.get_dummies(df_obj, drop_first=True)

# target = (df_obj['LAW_CAT_CD_M'])
# df_obj = df_obj.drop(columns=['LAW_CAT_CD_M'],axis=1)
# df_obj
target = df[_OFFENSE_LEVEL]
df_obj=df_obj.drop(columns=[_OFFENSE_LEVEL])

In [None]:
attr = pd.concat([df_obj,df_num],axis=1)


print(attr.shape[1])
attr.head()

8


Unnamed: 0,OFNS_DESC,LAW_CODE,ARREST_BORO,ARREST_PRECINCT,JURISDICTION_CODE,ARREST_MONTH,Latitude,Longitude
0,FELONY ASSAULT,PL-1211200,Q,105,0,January,40.737043,-73.735514
1,FELONY ASSAULT,PL-1200502,B,48,0,March,40.855109,-73.892818
2,FELONY ASSAULT,PL-1200512,S,121,0,May,40.628967,-74.163275
3,FELONY ASSAULT,PL-1211200,Q,100,0,June,40.59198,-73.800066
4,RAPE,PL-1302503,M,14,0,January,40.753533,-73.994537


In [None]:
from sklearn.model_selection import train_test_split

# Split the data into training and test
attr_train, attr_test, target_train, target_test = train_test_split(attr, target, test_size=0.20, random_state=5)

In [139]:
from sklearn.compose import make_column_selector

cat_selector = make_column_selector(dtype_include=object)
num_selector = make_column_selector(dtype_include=np.number)
num_selector(attr_train)

['Latitude', 'Longitude']

In [152]:
cat_selector(attr_train)

['OFNS_DESC',
 'LAW_CODE',
 'ARREST_BORO',
 'ARREST_PRECINCT',
 'JURISDICTION_CODE',
 'ARREST_MONTH']

In [156]:
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.compose import make_column_transformer
# Scaling on Latitude and Longitude because it's hard to make the pipeline without it
categoricalProcessor = OneHotEncoder(
    handle_unknown="infrequent_if_exist",sparse_output=False #MUST NOT BE SPARSE
)
numericalProcessor = MinMaxScaler()

categoricalPreprocessor = make_column_transformer(
        (categoricalProcessor, cat_selector(attr_train)),
    

)

numericalPreprocessor = make_column_transformer(
        (numericalProcessor, num_selector(attr_train)),
)
# See https://scikit-learn.org/stable/auto_examples/ensemble/plot_stack_predictors.html

In [157]:
# Create a pipeline with preprocessing and model
categoricalModel = Pipeline(steps=[('preprocessor', categoricalPreprocessor),
                        ('classifier', CategoricalNB())])

continuousModel = Pipeline(steps=[
     ('preprocessor', numericalPreprocessor),
                        ('classifier', GaussianNB()
)])




In [167]:
from sklearn.ensemble import StackingClassifier
stacking_classifier = StackingClassifier(
    estimators=[
        ('num_pipeline', continuousModel),
        ('cat_pipeline', categoricalModel)
    ]
)
stacking_classifier.fit(attr_train, target_train)
predictions = stacking_classifier.predict(attr_test)
print(predictions)

['M' 'M' 'M' ... 'M' 'M' 'F']


In [168]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

# Evaluate the accuracy

#Compute and print the accuracy score.
accuracy = accuracy_score(target_test, predictions)
print(f'Accuracy: {accuracy}')
print()

# Print the confusion matrix
print(confusion_matrix(target_test, predictions))
print()

#Print the classification report
print('Classification Report')
print(classification_report(target_test, predictions))

Accuracy: 0.9978535654662533

[[15960    30    21]
 [   13 21391     3]
 [   14     0   305]]

Classification Report
              precision    recall  f1-score   support

           F       1.00      1.00      1.00     16011
           M       1.00      1.00      1.00     21407
           V       0.93      0.96      0.94       319

    accuracy                           1.00     37737
   macro avg       0.97      0.98      0.98     37737
weighted avg       1.00      1.00      1.00     37737



In [None]:
###Parking lot for stuff I tried before

In [158]:
# Fit the models
categoricalModel.fit(attr_train, target_train)


In [159]:
continuousModel.fit(attr_train, target_train)

In [160]:
target_pred_categorical = categoricalModel.predict_proba(attr_test)


In [161]:
target_pred_continuous = continuousModel.predict_proba(attr_test)


In [162]:
target_pred_categorical.size


113211

In [163]:
target_pred_continuous.size

113211

In [164]:
combined_probs = (target_pred_categorical + target_pred_continuous) / 2

combined_preds = np.argmax(combined_probs, axis=1)
combined_preds.size

37737

In [165]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

# Evaluate the accuracy

#Compute and print the accuracy score.
accuracy = accuracy_score(target_test, combined_preds)
print(f'Accuracy: {accuracy}')
print()

# Print the confusion matrix
print(confusion_matrix(target_test, combined_preds))
print()

#Print the classification report
print('Classification Report')
print(classification_report(target_test, combined_preds))

Accuracy: 0.0



ValueError: Mix of label input types (string and number)