In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.tree import ExtraTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.semi_supervised import LabelSpreading
from sklearn import svm
from sklearn.neighbors.nearest_centroid import NearestCentroid
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import f1_score, accuracy_score, ndcg_score



In [2]:
crimes_df = pd.read_csv ('crimes_clean_time.csv', sep='""', delimiter=',', engine='python')
crimes_df.head(10)

Unnamed: 0,OFFENSE_CODE_GROUP,DISTRICT,SHOOTING,YEAR,MONTH,DAY_OF_WEEK,HOUR,UCR_PART,STREET,Lat,Long,IS_NIGHT,DAY_OF_MONTH,TIME_X,TIME_Y,Lat_M,Long_M
0,Disorderly Conduct,E18,0.0,2018,10,Wednesday,20,Part Two,ARLINGTON ST,42.262608,-71.121186,1,3,0.518198,0.000331,0.185665,0.267386
1,Property Lost,D14,0.0,2018,8,Thursday,20,Part Three,ALLSTON ST,42.352111,-71.135311,1,30,0.239106,0.073462,0.736023,0.201687
2,Other,B2,0.0,2018,10,Wednesday,19,Part Two,DEVON ST,42.308126,-71.07693,1,3,0.51784,0.000318,0.465558,0.473233
3,Aggravated Assault,A1,0.0,2018,10,Wednesday,20,Part One,CAMBRIDGE ST,42.359454,-71.059648,1,3,0.518198,0.000331,0.78117,0.553614
4,Aircraft,A7,0.0,2018,10,Wednesday,20,Part Three,PRESCOTT ST,42.375258,-71.024663,1,3,0.518198,0.000331,0.87835,0.716335
5,Vandalism,C11,0.0,2018,10,Tuesday,20,Part Two,DORCHESTER AVE,42.299197,-71.06047,1,2,0.509594,9.2e-05,0.410652,0.549792
6,Verbal Disputes,B2,0.0,2018,10,Wednesday,19,Part Three,TREMONT ST,42.333807,-71.103778,1,3,0.51784,0.000318,0.623468,0.348354
7,Simple Assault,E18,0.0,2018,10,Wednesday,19,Part Two,AVILA RD,42.256145,-71.128025,1,3,0.51784,0.000318,0.145926,0.235578
8,Towed,D4,0.0,2018,10,Wednesday,20,Part Three,COMMONWEALTH AVE,42.348866,-71.089363,1,3,0.518198,0.000331,0.716067,0.415404
9,Motor Vehicle Accident Response,D14,0.0,2018,10,Wednesday,19,Part Three,FOSTER ST,42.344323,-71.157784,1,3,0.51784,0.000318,0.688134,0.097164


In [3]:
crimes_df = crimes_df.sort_values(['YEAR', 'MONTH', 'DAY_OF_MONTH', 'HOUR'], ascending=[True, True, True, True])

le_ocg = LabelEncoder()
ocg_labels = le_ocg.fit_transform(crimes_df['OFFENSE_CODE_GROUP'])
crimes_df['OFFENSE_CODE_GROUP'] = ocg_labels

le_dw = LabelEncoder()
dw_labels = le_dw.fit_transform(crimes_df['DAY_OF_WEEK'])
crimes_df['DAY_OF_WEEK'] = dw_labels

le_UCR = LabelEncoder()
ucr_labels = le_UCR.fit_transform(crimes_df['UCR_PART'])
crimes_df['UCR_PART'] = ucr_labels

le_district = LabelEncoder()
district_labels = le_district.fit_transform(crimes_df['DISTRICT'])
crimes_df['DISTRICT'] = district_labels

In [4]:
crimes_df_train_test = crimes_df.head(int(crimes_df.shape[0] * 0.8))
crimes_df_validation = crimes_df.tail(crimes_df.shape[0] - int(crimes_df.shape[0] * 0.8))
crimes_df_train_test.drop('YEAR', 1, inplace= True)
crimes_df_validation.drop('YEAR', 1, inplace= True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [5]:
# DecisionTreeClassifier

xTrain, xTest, yTrain, yTest = train_test_split(crimes_df_train_test.loc[:,['OFFENSE_CODE_GROUP','OFFENSE_CODE_GROUP','MONTH','DAY_OF_WEEK','HOUR','IS_NIGHT','UCR_PART','DAY_OF_MONTH','TIME_X','TIME_Y']], crimes_df_train_test.loc[:,'DISTRICT'], test_size = 0.25, random_state = 0)
xValidation = crimes_df_validation.loc[:,['OFFENSE_CODE_GROUP','OFFENSE_CODE_GROUP','MONTH','DAY_OF_WEEK','HOUR','IS_NIGHT','UCR_PART','DAY_OF_MONTH','TIME_X','TIME_Y']]
yValidation = crimes_df_validation.loc[:,'DISTRICT']


dec_tree = DecisionTreeClassifier(min_weight_fraction_leaf=0.4, max_depth=30)
dec_tree.fit(xTrain, yTrain)
dec_tree_pred = dec_tree.predict(xTest)
dec_tree_score = f1_score(yTest, dec_tree_pred, average=None)

print('Accuracy: ' + str(dec_tree_score))

print('mean: ' + str(dec_tree_score.mean()))
print('max: ' + str(dec_tree_score.max()))
print('min: ' + str(dec_tree_score.min()))

dec_tree_pred = dec_tree.predict(xValidation)
dec_tree_score = f1_score(yValidation,dec_tree_pred, average=None )

print('Accuracy: ' + str(dec_tree_score))

print('mean: ' + str(dec_tree_score.mean()))
print('max: ' + str(dec_tree_score.max()))
print('min: ' + str(dec_tree_score.min()))

Accuracy: [0.         0.         0.         0.25752779 0.         0.
 0.         0.         0.23728814 0.         0.         0.        ]
mean: 0.041234660104669724
max: 0.2575277856628163
min: 0.0
Accuracy: [0.         0.         0.         0.25206764 0.         0.
 0.         0.         0.23612544 0.         0.         0.        ]
mean: 0.04068275685427391
max: 0.2520676406104152
min: 0.0


In [6]:
ext_tree = ExtraTreeClassifier()
ext_tree = ext_tree.fit(xTrain, yTrain)

ext_tree_pred = ext_tree.predict(xTest)

ext_tree_score = f1_score(yTest, ext_tree_pred, average=None)

print('Accuracy: ' + str(ext_tree_score))

print('mean: ' + str(ext_tree_score.mean()))
print('max: ' + str(ext_tree_score.max()))
print('min: ' + str(ext_tree_score.min()))

ext_tree_pred = ext_tree.predict(xValidation)
ext_tree_score = f1_score(yValidation, ext_tree_pred, average=None)

print('Accuracy: ' + str(ext_tree_score))

print('mean: ' + str(ext_tree_score.mean()))
print('max: ' + str(ext_tree_score.max()))
print('min: ' + str(ext_tree_score.min()))

Accuracy: [0.21116555 0.06157113 0.08819396 0.21561375 0.16928308 0.18676583
 0.11878616 0.09767254 0.18803648 0.09302326 0.08579692 0.08840017]
mean: 0.13369240227814064
max: 0.21561375449820075
min: 0.06157112526539278
Accuracy: [0.15713007 0.03080229 0.05042918 0.16522136 0.13087102 0.1374541
 0.07813707 0.06843906 0.15026012 0.0531386  0.06244117 0.04168439]
mean: 0.09383403582275046
max: 0.16522136010953903
min: 0.030802292263610313


In [7]:
neigh = KNeighborsClassifier()
neigh=neigh.fit(xTrain, yTrain)

neigh_pred=neigh.predict(xTest)

neigh_score=f1_score(yTest, neigh_pred, average=None)

print('Accuracy: ' + str(neigh_score))

print('mean: ' + str(neigh_score.mean()))
print('max: ' + str(neigh_score.max()))
print('min: ' + str(neigh_score.min()))

neigh_pred=neigh.predict(xValidation)
neigh_score=f1_score(yValidation, neigh_pred, average=None)

print('Accuracy: ' + str(neigh_score))

print('mean: ' + str(neigh_score.mean()))
print('max: ' + str(neigh_score.max()))
print('min: ' + str(neigh_score.min()))

Accuracy: [0.21029874 0.04853723 0.06843081 0.22330357 0.161379   0.17387009
 0.08603708 0.07669123 0.17848495 0.05147841 0.05603178 0.04703882]
mean: 0.11513181007062943
max: 0.22330357142857146
min: 0.047038821195806185
Accuracy: [0.18390249 0.02415144 0.05124678 0.19765105 0.12780577 0.13375434
 0.06043046 0.05877357 0.16451725 0.03655244 0.03451076 0.02783964]
mean: 0.09176133267093388
max: 0.19765105166793195
min: 0.024151436031331595


In [8]:
rfc = RandomForestClassifier()
rfc = rfc.fit(xTrain, yTrain)

rfc_pred = rfc.predict(xTest)

rfc_score = f1_score(yTest, rfc_pred, average=None)

print('Accuracy: ' + str(rfc_score))

print('mean: ' + str(rfc_score.mean()))
print('max: ' + str(rfc_score.max()))
print('min: ' + str(rfc_score.min()))

rfc_pred = rfc.predict(xValidation)
rfc_score= f1_score(yValidation, rfc_pred, average=None)


print('Accuracy: ' + str(rfc_score))

print('mean: ' + str(rfc_score.mean()))
print('max: ' + str(rfc_score.max()))
print('min: ' + str(rfc_score.min()))

Accuracy: [0.21821632 0.06320542 0.08475648 0.23122695 0.18851104 0.19975102
 0.12026239 0.11224351 0.21019694 0.10075648 0.10397456 0.09231421]
mean: 0.14378460980492083
max: 0.23122694861076962
min: 0.06320541760722348
Accuracy: [0.17118967 0.02081165 0.04065041 0.19126917 0.14702764 0.15243116
 0.08056929 0.07041649 0.19367705 0.04653533 0.05952381 0.03905724]
mean: 0.10109657530736889
max: 0.19367704850724385
min: 0.02081165452653486


In [9]:
svm = svm.SVC()
svm.fit(xTrain, yTrain)
svm_pred = svm.predict(xTest)

svm_score = f1_score(yTest, svm_pred, average=None)

print('Accuracy: ' + str(svm_score))

print('mean: ' + str(svm_score.mean()))
print('max: ' + str(svm_score.max()))
print('min: ' + str(svm_score.min()))

rfc_pred = rfc.predict(xValidation)
rfc_score= f1_score(yValidation, svm_pred, average=None)


print('Accuracy: ' + str(svm_score))

print('mean: ' + str(svm_score.mean()))
print('max: ' + str(svm_score.max()))
print('min: ' + str(svm_score.min()))

Accuracy: [0.00181077 0.         0.         0.27234563 0.         0.
 0.         0.         0.23329032 0.         0.         0.        ]
mean: 0.04228722746138772
max: 0.27234563285007735
min: 0.0
Accuracy: [0.00181077 0.         0.         0.27234563 0.         0.
 0.         0.         0.23329032 0.         0.         0.        ]
mean: 0.04228722746138772
max: 0.27234563285007735
min: 0.0
