In [22]:
#importing required packages
import pandas as pd
import numpy as np


from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.tree import DecisionTreeClassifier

from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV


from sklearn.metrics import accuracy_score

In [3]:
#reading data from csv
crime_raw_data = pd.read_csv('data/rpd_crime_2011_to_present.csv')
crime_raw_data.shape

(76228, 40)

In [5]:
#converting city names to upper
crime_raw_data.loc[:,['Address_City']] = crime_raw_data.Address_City.apply(str.upper)

In [76]:
#only getting rochester data
roc_crime_data = crime_raw_data.loc[crime_raw_data.Address_City=='ROCHESTER',:]
roc_crime_data = crime_raw_data.loc[crime_raw_data.Statute_Text!='Larceny',:]
roc_crime_data.shape

(31094, 40)

In [77]:
roc_crime_data = roc_crime_data.dropna()

In [78]:
len(roc_crime_data.Location_Type.unique())

56

In [79]:
roc_crime_data.columns

Index(['X', 'Y', 'OBJECTID', 'Geocode_Address', 'Geocode_Street',
       'Case_Number', 'OccurredFrom_Date_Year', 'OccurredFrom_Date_Month',
       'OccurredFrom_Time', 'OccurredFrom_Timestamp',
       'OccurredThrough_Date_Year', 'OccurredThrough_Date_Month',
       'OccurredThrough_Time', 'OccurredThrough_Timestamp',
       'Reported_Date_Year', 'Reported_Date_Month', 'Reported_Time',
       'Reported_Timestamp', 'Address_StreetFull', 'Address_City',
       'Address_State', 'Patrol_Beat', 'Patrol_Section', 'Case_Status',
       'Statute_Title', 'Statute_Section', 'Statute_Subsection',
       'Statute_Degree', 'Statute_Class', 'Statute_Category', 'Statute_Text',
       'Statute_Description', 'Statute_CrimeCategory', 'Statute_Attempted',
       'Weapon_Description', 'Larceny_Type', 'Location_Type', 'Geo_Beat',
       'Geo_Section', 'Geo_Section_Num'],
      dtype='object')

In [80]:
roc_crime_data_features = roc_crime_data.loc[:,['X','Y','OccurredFrom_Date_Month','OccurredFrom_Time']]
label = roc_crime_data.loc[:,['Statute_Text']]

In [81]:
#train test split
X_train, X_test, y_train, y_test = train_test_split(roc_crime_data_features, label, 
                                                    test_size=0.2, 
                                                    random_state=123, 
                                                    stratify=label)

In [82]:
X_train.shape

(24693, 4)

In [83]:
X_test.shape

(6174, 4)

In [84]:
#creating pipeline
pipeline = make_pipeline(preprocessing.StandardScaler(), 
                         DecisionTreeClassifier(random_state=0))

In [85]:
#need to add more
hyperparameters = { 'decisiontreeclassifier__criterion' : ['gini', 'entropy'],
                  'decisiontreeclassifier__max_depth': [None, 100, 50, 10],
                  'decisiontreeclassifier__splitter': ['best','random']}



In [86]:
#add time
clf = GridSearchCV(pipeline, hyperparameters, cv=10)
 
# Fit and tune model
clf.fit(X_train, y_train.Statute_Text.values)

GridSearchCV(cv=10, error_score='raise',
       estimator=Pipeline(steps=[('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('decisiontreeclassifier', DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=0, splitter='best'))]),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'decisiontreeclassifier__max_depth': [None, 100, 50, 10], 'decisiontreeclassifier__criterion': ['gini', 'entropy'], 'decisiontreeclassifier__splitter': ['best', 'random']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [88]:
print(clf.best_params_)

{'decisiontreeclassifier__max_depth': 10, 'decisiontreeclassifier__criterion': 'gini', 'decisiontreeclassifier__splitter': 'best'}


In [89]:
print(clf.refit)

True


In [90]:
y_pred = clf.predict(X_test)
y_pred_prob = clf.predict_proba(X_test)

In [91]:

np.unique(y_pred)


array(['Aggravated Assault', 'Burglary', 'Motor Vehicle Theft', 'Murder',
       'Robbery'], dtype=object)

In [92]:
pd.Series(y_pred).value_counts()

Burglary               4751
Aggravated Assault      997
Robbery                 280
Motor Vehicle Theft     142
Murder                    4
dtype: int64

In [93]:
accuracy_score(y_test, y_pred)

0.50194363459669578

In [94]:
pd.DataFrame(y_pred_prob)


Unnamed: 0,0,1,2,3,4,5
0,0.233486,0.473059,0.109893,0.008219,0.000304,0.175038
1,0.233486,0.473059,0.109893,0.008219,0.000304,0.175038
2,0.233486,0.473059,0.109893,0.008219,0.000304,0.175038
3,0.255223,0.363162,0.161908,0.009749,0.000000,0.209958
4,0.233486,0.473059,0.109893,0.008219,0.000304,0.175038
5,0.361722,0.235407,0.111005,0.006699,0.003828,0.281340
6,0.121429,0.728571,0.082143,0.003571,0.000000,0.064286
7,0.156682,0.364055,0.313364,0.000000,0.004608,0.161290
8,0.179211,0.612903,0.094982,0.007168,0.000000,0.105735
9,0.454545,0.136364,0.045455,0.000000,0.000000,0.363636


In [95]:
clf.best_estimator_.classes_

array(['Aggravated Assault', 'Burglary', 'Motor Vehicle Theft', 'Murder',
       'Non-Negligent Manslaughter', 'Robbery'], dtype=object)