In [1]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import collections

In [2]:
# Load the three input tables from the current working directory.
# The two auxiliary tables use pandas' default type inference.
crime_data = pd.read_csv('Crime_Column_Description.csv')
population_data = pd.read_csv('Population_by_Borough_NYC.csv')
# The complaint export is read with dtype=object, so every column arrives
# as raw, uninterpreted values (no numeric/date inference at load time).
complaint_data = pd.read_csv(
    'NYPD_Complaint_Data_Historic.csv',
    dtype=object,
)

In [3]:
# Columns of the complaint export that this analysis does not use.
# Dropping a column that is absent from the frame raises KeyError.
UNUSED_COLUMNS = [
    "KY_CD", "PD_CD", "ADDR_PCT_CD", "Latitude", "Longitude",
    "RPT_DT", "PARKS_NM", "HADEVELOPT", "Lat_Lon", "PREM_TYP_DESC",
    "LOC_OF_OCCUR_DESC", "PD_DESC", "OFNS_DESC", "CMPLNT_TO_TM", "CMPLNT_TO_DT",
    "CMPLNT_NUM", "X_COORD_CD", "Y_COORD_CD", "JURIS_DESC", "CRM_ATPT_CPTD_CD",
]

# Remove the unused columns, then discard every row that still has a missing
# value in any of the remaining columns.
complaint_data = complaint_data.drop(columns=UNUSED_COLUMNS).dropna()
complaint_data.head(5)

Unnamed: 0,CMPLNT_FR_DT,CMPLNT_FR_TM,LAW_CAT_CD,BORO_NM
0,12/31/2015,23:45:00,FELONY,BRONX
1,12/31/2015,23:36:00,FELONY,QUEENS
2,12/31/2015,23:30:00,FELONY,MANHATTAN
3,12/31/2015,23:30:00,MISDEMEANOR,QUEENS
4,12/31/2015,23:25:00,MISDEMEANOR,MANHATTAN


In [4]:
# Break the occurrence time ("HH:MM:SS") and date ("MM/DD/YYYY") into their
# components using vectorised string splitting instead of a per-row lambda.
time_parts = complaint_data['CMPLNT_FR_TM'].str.split(':', expand=True)
date_parts = complaint_data['CMPLNT_FR_DT'].str.split('/', expand=True)

# The source columns were loaded as strings (dtype=object). Cast each part to
# int so that the columns later used as model features are genuinely numeric
# rather than relying on implicit string-to-float coercion downstream.
complaint_data['HOUR'] = time_parts[0].astype(int)
complaint_data['MINUTE'] = time_parts[1].astype(int)
complaint_data['MONTH'] = date_parts[0].astype(int)
complaint_data['DAY'] = date_parts[1].astype(int)
complaint_data['YEAR'] = date_parts[2].astype(int)

# The raw date/time strings are fully represented by the parts above.
complaint_data = complaint_data.drop(columns=["CMPLNT_FR_DT", "CMPLNT_FR_TM"])
complaint_data.head(5)

Unnamed: 0,LAW_CAT_CD,BORO_NM,HOUR,MINUTE,MONTH,DAY,YEAR
0,FELONY,BRONX,23,45,12,31,2015
1,FELONY,QUEENS,23,36,12,31,2015
2,FELONY,MANHATTAN,23,30,12,31,2015
3,MISDEMEANOR,QUEENS,23,30,12,31,2015
4,MISDEMEANOR,MANHATTAN,23,25,12,31,2015


In [5]:

# Replace the offence-level strings with integer codes. pd.factorize numbers
# the distinct values in order of first appearance, so the codes carry no
# ordinal meaning of their own.
law_category_codes, law_category_names = pd.factorize(complaint_data['LAW_CAT_CD'])
complaint_data["LAW_CAT_CD"] = law_category_codes


In [6]:
# Encode the borough name as the integer target column "ACTUAL".
# pd.factorize assigns codes in order of first appearance; `boro_names` keeps
# the inverse mapping (boro_names[i] is the borough for label i) so that the
# numeric labels in later reports can be translated back to borough names.
boro_codes, boro_names = pd.factorize(complaint_data['BORO_NM'])
complaint_data["ACTUAL"] = boro_codes

# Drop the original borough text together with the date/time parts that are
# not used as features; only LAW_CAT_CD, HOUR and MONTH remain as predictors.
complaint_data = complaint_data.drop(columns=['BORO_NM', 'MINUTE', 'DAY', 'YEAR'])
complaint_data.head()

Unnamed: 0,LAW_CAT_CD,HOUR,MONTH,ACTUAL
0,0,23,12,0
1,0,23,12,1
2,0,23,12,2
3,1,23,12,1
4,1,23,12,2


In [7]:
# Separate the target from the predictors and convert both to NumPy arrays.
# NOTE: this cell rebinds `complaint_data` to a frame without the target
# column, so running it a second time raises KeyError on 'ACTUAL'.
target_column = 'ACTUAL'
labels = np.array(complaint_data[target_column])
complaint_data = complaint_data.drop(columns=[target_column])

# Remember the predictor names in column order; `features` has one column per name.
feature_list = [column_name for column_name in complaint_data.columns]
features = np.array(complaint_data)

In [8]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for testing. The fixed random_state makes the
# shuffle, and therefore the split, repeatable across runs.
split_parts = train_test_split(
    features,
    labels,
    test_size=0.25,
    random_state=42,
)
train_features, test_features, train_labels, test_labels = split_parts

In [9]:
# Sanity check: within each split the feature matrix and the label vector
# should report the same number of rows.
for caption, split_array in (
    ('Training Features Shape:', train_features),
    ('Training Labels Shape:', train_labels),
    ('Testing Features Shape:', test_features),
    ('Testing Labels Shape:', test_labels),
):
    print(caption, split_array.shape)

Training Features Shape: (15000, 3)
Training Labels Shape: (15000,)
Testing Features Shape: (5000, 3)
Testing Labels Shape: (5000,)


In [10]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics

# k-nearest-neighbours classifier with k=10, fitted on the training split.
# The features are passed exactly as built above; no scaling is applied here.
# fit() returns the estimator itself, so construction and fitting are chained.
knn = KNeighborsClassifier(n_neighbors=10).fit(train_features, train_labels)

# Predicted borough codes for the held-out rows.
y_pred = knn.predict(test_features)

In [11]:
from sklearn.metrics import classification_report, confusion_matrix

# Print the confusion matrix (rows: true label, columns: predicted label)
# followed by the per-class precision/recall/F1 report for the KNN predictions.
for summarize in (confusion_matrix, classification_report):
    print(summarize(test_labels, y_pred))

# Overall fraction of test rows whose borough code was predicted correctly.
print("Accuracy:", metrics.accuracy_score(test_labels, y_pred))

[[314 167 256 345   2]
 [274 201 243 313   2]
 [302 184 306 332   1]
 [430 240 361 485   2]
 [ 54  39  65  80   2]]
              precision    recall  f1-score   support

           0       0.23      0.29      0.26      1084
           1       0.24      0.19      0.22      1033
           2       0.25      0.27      0.26      1125
           3       0.31      0.32      0.32      1518
           4       0.22      0.01      0.02       240

   micro avg       0.26      0.26      0.26      5000
   macro avg       0.25      0.22      0.21      5000
weighted avg       0.26      0.26      0.25      5000

Accuracy: 0.2616


In [12]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validated accuracy of the KNN model. Note that this runs over
# the full feature matrix and label vector, not only the training split.
knn_cv_settings = dict(
    X=features,
    y=labels,
    scoring="accuracy",
    cv=10,
)
knn_scores = cross_val_score(knn, **knn_cv_settings)

print("Accuracy per fold: ")
print(knn_scores)
print("Average accuracy: ", knn_scores.mean())

Accuracy per fold: 
[ 0.24975025  0.24075924  0.26486757  0.25837081  0.252       0.25562781
  0.26363182  0.25762881  0.24662331  0.25425425]
Average accuracy:  0.254351388145


In [13]:
from sklearn.tree import DecisionTreeClassifier

# Decision tree limited to depth 8, fitted on the training split.
# random_state is fixed because scikit-learn randomly permutes the features
# at every split (even with the default "best" splitter), so an unseeded tree
# can differ between runs. The seed matches the one used for the
# train/test split.
classifier = DecisionTreeClassifier(max_depth=8, random_state=42)
classifier.fit(train_features, train_labels)

# NOTE: this rebinds `y_pred`, replacing the KNN predictions computed earlier.
y_pred = classifier.predict(test_features)

In [14]:
# Evaluate the decision-tree predictions currently held in `y_pred`.
# Confusion matrix: rows are true labels, columns are predicted labels.
tree_confusion = confusion_matrix(test_labels, y_pred)
print(tree_confusion)

# Per-class precision/recall/F1. A class that is never predicted has an
# undefined precision, which scikit-learn reports as 0 together with a warning.
tree_report = classification_report(test_labels, y_pred)
print(tree_report)

# Overall fraction of test rows whose borough code was predicted correctly.
tree_accuracy = metrics.accuracy_score(test_labels, y_pred)
print("Accuracy:", tree_accuracy)

[[  97   81  138  768    0]
 [  83   81  129  740    0]
 [  93   83  142  807    0]
 [ 127  106  187 1098    0]
 [  14   11   37  178    0]]
              precision    recall  f1-score   support

           0       0.23      0.09      0.13      1084
           1       0.22      0.08      0.12      1033
           2       0.22      0.13      0.16      1125
           3       0.31      0.72      0.43      1518
           4       0.00      0.00      0.00       240

   micro avg       0.28      0.28      0.28      5000
   macro avg       0.20      0.20      0.17      5000
weighted avg       0.24      0.28      0.22      5000

Accuracy: 0.2836


  'precision', 'predicted', average, warn_for)


In [15]:
# 10-fold cross-validated accuracy of the decision tree, computed over the
# full feature matrix and label vector (not only the training split).
dt_scores = cross_val_score(
    classifier,
    features,
    y=labels,
    scoring="accuracy",
    cv=10,
)

print("Accuracy per fold: ")
print(dt_scores)
print("Average accuracy: ", dt_scores.mean())

Accuracy per fold: 
[ 0.27922078  0.28571429  0.27836082  0.27336332  0.2835      0.28014007
  0.27963982  0.28914457  0.28764382  0.28378378]
Average accuracy:  0.282051127079
