In [88]:
#reference :  https://elitedatascience.com/python-machine-learning-tutorial-scikit-learn
#importing required packages
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.tree import DecisionTreeClassifier

from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV


from sklearn.metrics import accuracy_score
from imblearn.under_sampling import RandomUnderSampler

from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier

#module to save models
from sklearn.externals import joblib

#importing tabpy client
import tabpy_client

In [2]:
#reading data from csv
crime_raw_data = pd.read_csv('data/rpd_crime_2011_to_present.csv')
crime_raw_data.shape

(76228, 40)

In [3]:
#converting city names to upper
crime_raw_data.loc[:,['Address_City']] = crime_raw_data.Address_City.apply(str.upper)

In [4]:
#only getting rochester data
roc_crime_data = crime_raw_data.loc[crime_raw_data.Address_City=='ROCHESTER',:]
#roc_crime_data = crime_raw_data.loc[crime_raw_data.Statute_Text!='Larceny',:]
roc_crime_data.shape

(75468, 40)

In [5]:
roc_crime_data = roc_crime_data.dropna()

In [6]:
len(roc_crime_data.Location_Type.unique())

56

In [7]:
roc_crime_data.columns

Index(['X', 'Y', 'OBJECTID', 'Geocode_Address', 'Geocode_Street',
       'Case_Number', 'OccurredFrom_Date_Year', 'OccurredFrom_Date_Month',
       'OccurredFrom_Time', 'OccurredFrom_Timestamp',
       'OccurredThrough_Date_Year', 'OccurredThrough_Date_Month',
       'OccurredThrough_Time', 'OccurredThrough_Timestamp',
       'Reported_Date_Year', 'Reported_Date_Month', 'Reported_Time',
       'Reported_Timestamp', 'Address_StreetFull', 'Address_City',
       'Address_State', 'Patrol_Beat', 'Patrol_Section', 'Case_Status',
       'Statute_Title', 'Statute_Section', 'Statute_Subsection',
       'Statute_Degree', 'Statute_Class', 'Statute_Category', 'Statute_Text',
       'Statute_Description', 'Statute_CrimeCategory', 'Statute_Attempted',
       'Weapon_Description', 'Larceny_Type', 'Location_Type', 'Geo_Beat',
       'Geo_Section', 'Geo_Section_Num'],
      dtype='object')

In [8]:
roc_crime_data_features = roc_crime_data.loc[:,['X','Y','OccurredFrom_Date_Month','OccurredFrom_Time']]
label = roc_crime_data.loc[:,['Statute_Text']]

In [9]:
label.Statute_Text.value_counts()

Larceny                       44441
Burglary                      14827
Aggravated Assault             6350
Robbery                        5106
Motor Vehicle Theft            4149
Murder                          222
Non-Negligent Manslaughter       12
Name: Statute_Text, dtype: int64

In [11]:
#http://contrib.scikit-learn.org/imbalanced-learn/stable/auto_examples/under-sampling/plot_random_under_sampler.html

# Apply the random under-sampling
rus = RandomUnderSampler(ratio={'Larceny' : 20000},return_indices=True)
X_resampled, y_resampled, idx_resampled = rus.fit_sample(roc_crime_data_features, label.Statute_Text.values)

In [12]:
pd.Series(y_resampled).value_counts()

Larceny                       20000
Burglary                      14827
Aggravated Assault             6350
Robbery                        5106
Motor Vehicle Theft            4149
Murder                          222
Non-Negligent Manslaughter       12
dtype: int64

In [13]:
#train test split
X_train, X_test, y_train, y_test = train_test_split(roc_crime_data_features, label, 
                                                    test_size=0.2, 
                                                    random_state=123, 
                                                    stratify=label)

In [14]:
#Baseline model
#Since larceny is the highest crime type, check the accuracy by predicting all events as 'Larceny'

print(np.mean(y_test.Statute_Text == 'Larceny'))
np.mean(y_resampled == 'Larceny')

0.591732126215


0.3947420360794221

In [82]:
X_train.shape

(24693, 4)

In [83]:
X_test.shape

(6174, 4)

In [59]:
#creating pipeline for decision tree
pipeline = make_pipeline(preprocessing.StandardScaler(), 
                         DecisionTreeClassifier(random_state=0))

In [60]:
#need to add more
hyperparameters = { 'decisiontreeclassifier__criterion' : ['gini', 'entropy'],
                  'decisiontreeclassifier__max_depth': [None, 100, 50, 10],
                  'decisiontreeclassifier__splitter': ['best','random']}



In [33]:
#add time
def gridSearch(pipeline,hyperparameters):
    clft = GridSearchCV(pipeline, hyperparameters, cv=10)
    # Fit and tune model
    #clft.fit(X_train, y_train.Statute_Text.values)
    clft.fit(X_resampled, y_resampled)
    return clft

In [62]:
clf = gridSearch(pipeline,hyperparameters)

In [63]:
print(clf.best_params_)

{'decisiontreeclassifier__splitter': 'best', 'decisiontreeclassifier__criterion': 'gini', 'decisiontreeclassifier__max_depth': 10}


In [64]:
print(clf.refit)

True


In [65]:
y_pred = clf.predict(X_test)
y_pred_prob = clf.predict_proba(X_test)

In [42]:
np.unique(y_pred)

array(['Aggravated Assault', 'Burglary', 'Larceny', 'Motor Vehicle Theft',
       'Murder', 'Robbery'], dtype=object)

In [66]:
pd.Series(y_pred).value_counts()

Larceny                7951
Burglary               5421
Aggravated Assault     1250
Robbery                 284
Motor Vehicle Theft     115
Murder                    1
dtype: int64

In [67]:
accuracy_score(y_test, y_pred)

0.54207162827852484

In [68]:
pd.DataFrame(y_pred_prob)


Unnamed: 0,0,1,2,3,4,5,6
0,0.077143,0.200000,0.557143,0.100000,0.000000,0.000000,0.065714
1,0.015385,0.553846,0.276923,0.092308,0.000000,0.015385,0.046154
2,0.142857,0.420635,0.261905,0.087302,0.000000,0.000000,0.087302
3,0.000000,0.026316,0.973684,0.000000,0.000000,0.000000,0.000000
4,0.036674,0.246908,0.614925,0.048614,0.000853,0.000000,0.052026
5,0.136252,0.361121,0.315937,0.072504,0.006305,0.000350,0.107531
6,0.064722,0.165907,0.599818,0.083865,0.001823,0.000000,0.083865
7,0.000000,0.142857,0.755102,0.071429,0.000000,0.000000,0.030612
8,0.150307,0.297546,0.331288,0.092025,0.006135,0.000000,0.122699
9,0.218009,0.248815,0.306872,0.082938,0.010664,0.000000,0.132701


In [45]:
clf.best_estimator_.classes_

array(['Aggravated Assault', 'Burglary', 'Larceny', 'Motor Vehicle Theft',
       'Murder', 'Non-Negligent Manslaughter', 'Robbery'], dtype=object)

In [101]:
#pipeline for random forests
pipeline2 = make_pipeline(preprocessing.StandardScaler(), 
                         RandomForestClassifier(max_features=None))

In [102]:
hyperparameters2 = { 'randomforestclassifier__criterion' : ['gini', 'entropy'],
                  'randomforestclassifier__max_depth': [None, 100, 50, 10, 5]}


In [103]:
clf2 = gridSearch(pipeline2,hyperparameters2)

In [105]:
print(clf2.best_params_)
print(clf2.refit)


{'randomforestclassifier__criterion': 'gini', 'randomforestclassifier__max_depth': 10}
True


In [106]:
y_pred = clf2.predict(X_test)
y_pred_prob = clf2.predict_proba(X_test)

In [107]:
pd.Series(y_pred).value_counts()

Larceny                8718
Burglary               5057
Aggravated Assault     1019
Robbery                 184
Motor Vehicle Theft      42
Murder                    2
dtype: int64

In [108]:
accuracy_score(y_test, y_pred)

0.59099986686193584

In [90]:
#pipeline for adaboost
pipeline3 = make_pipeline(preprocessing.StandardScaler(), 
                         AdaBoostClassifier())

In [91]:
hyperparameters3 = {  'adaboostclassifier__algorithm' : ['SAMME', 'SAMME.R'],
                  'adaboostclassifier__n_estimators': [100, 50, 35, 25, 10]}


In [92]:
clf3 = gridSearch(pipeline3,hyperparameters3)

In [93]:
print(clf3.best_params_)
print(clf3.refit)

{'adaboostclassifier__algorithm': 'SAMME.R', 'adaboostclassifier__n_estimators': 25}
True


In [94]:
y_pred = clf3.predict(X_test)
y_pred_prob = clf3.predict_proba(X_test)

In [95]:
pd.Series(y_pred).value_counts()

Larceny                       11503
Burglary                       3209
Aggravated Assault              283
Non-Negligent Manslaughter       27
dtype: int64

In [96]:
accuracy_score(y_test, y_pred)

0.54393556117694053

In [34]:
#pipeline for nearest neighbors
pipeline4 = make_pipeline(preprocessing.StandardScaler(), 
                         KNeighborsClassifier())

In [35]:
hyperparameters4 = {  'kneighborsclassifier__n_neighbors' : [4,5,6,7],
                  'kneighborsclassifier__weights': ['distance'],
                   'kneighborsclassifier__algorithm': ['auto']}


In [36]:
clf4 = gridSearch(pipeline4,hyperparameters4)

In [37]:
print(clf4.best_params_)
print(clf4.refit)

{'kneighborsclassifier__n_neighbors': 7, 'kneighborsclassifier__weights': 'distance', 'kneighborsclassifier__algorithm': 'auto'}
True


In [206]:
y_pred = clf4.predict(X_test)
y_pred_prob = clf4.predict_proba(X_test)

In [207]:
pd.Series(y_pred).value_counts()

Larceny                       6908
Burglary                      4236
Aggravated Assault            1589
Robbery                       1237
Motor Vehicle Theft           1002
Murder                          47
Non-Negligent Manslaughter       3
dtype: int64

In [85]:
accuracy_score(y_test, y_pred)

0.86672879776328049

In [86]:
#saving the model
joblib.dump(clf4, 'trained_model/knn_classifier.pkl')

['trained_model/knn_classifier.pkl']

In [87]:
#reloading and test
clf5 = joblib.load('trained_model/knn_classifier.pkl')
y_pred = clf5.predict(X_test)
y_pred_prob = clf5.predict_proba(X_test)
accuracy_score(y_test, y_pred)

0.86672879776328049

In [211]:
import geocoder
g = geocoder.google('94, E Squire Dr, Rochester, NY , 14623')
g.latlng

'14623'

In [81]:
def get_data():
    input_add = input("Enter address, comma separated")
    lat_long = geocoder.google(input_add).latlng
    print(lat_long)
    X = float(lat_long[1])
    Y = float(lat_long[0])
    month = int(input("Enter Month of Year : 1-12"))
    time_24hr = int(input("Enter time : 24hr clock"))
    
    input_data = {'X' : X, 'Y' : Y, 'OccurredFrom_Date_Month' : month, 'OccurredFrom_Time' : time_24hr}
    l = clf4.predict_proba(pd.DataFrame([input_data]))[0].tolist()
    z = zip(clf4.classes_.tolist(),l)
    for i in z:
        print(i)
    return z

    

In [59]:
clf4.classes_

array(['Aggravated Assault', 'Burglary', 'Larceny', 'Motor Vehicle Theft',
       'Murder', 'Non-Negligent Manslaughter', 'Robbery'], dtype=object)

In [83]:
output_data = get_data()

Enter address, comma separated100,clinton ave south, rochester, 14604
[43.1545777, -77.6046532]
Enter Month of Year : 1-126
Enter time : 24hr clock1400
('Aggravated Assault', 0.0)
('Burglary', 0.4285715962105395)
('Larceny', 0.4285712650657843)
('Motor Vehicle Theft', 0.1428571387236762)
('Murder', 0.0)
('Non-Negligent Manslaughter', 0.0)
('Robbery', 0.0)


In [200]:
#defining function to work with tableau, it gets input from tableau
def crimeKnnClassifier(address,month,time_24hr):
    clf = joblib.load('/Users/ajaykliyara/Documents/cs63/final_project/trained_model/knn_classifier.pkl')
    lat_long = geocoder.google(address).latlng
    #print(clf)
    #print(lat_long)
    X = float(lat_long[1])
    Y = float(lat_long[0])
    input_data = {'X' : X, 'Y' : Y, 'OccurredFrom_Date_Month' : month, 'OccurredFrom_Time' : time_24hr}
    input_data = pd.DataFrame([input_data])
    input_data = input_data.loc[:,['X','Y','OccurredFrom_Date_Month','OccurredFrom_Time']]
    l = clf.predict_proba(input_data)[0].tolist()
    z = zip(clf.classes_.tolist(),l)
    #print(z)
    lst=[]
    for i in z:
        lst.append(i[1])
        
    return lst

In [201]:
result = crimeKnnClassifier('100,Clinton Ave S,Rochester, 14604',6,1400)
result

[0.0,
 0.17648563270829834,
 0.5757851755129982,
 0.0,
 0.0,
 0.0,
 0.24772919177870334]

In [202]:
result = crimeKnnClassifier('798,Jay St,Rochester,NY, 14611',2,1815)
result

[0.0,
 0.8166390472161272,
 0.0,
 0.10465289415123745,
 0.0,
 0.0,
 0.07870805863263539]

In [97]:
#reference : https://github.com/tableau/TabPy/blob/master/client.md
client = tabpy_client.Client('http://localhost:9004/')

In [203]:
from tabpy_client.schema import generate_schema

schema = generate_schema(
  input={'address': '100,Clinton Ave S, Rochester, NY, 14604',
         'month': 6,
         'time_24hr' : 1400},
  output=[0.0,0.2,0.4,0.2,0.2],
  input_description={'address': 'address separated by comma',
                     'month': 'month of the year',
                     'time_24hr' : 'time of the day'},
  output_description='probability of a crime type at the given location, month and time')

In [204]:
client.deploy('crimeKnnClassifier', crimeKnnClassifier, 
              'Returns probabilites of Crime time for given address, month and time of day', schema = schema, override=True)

In [205]:
client.query('crimeKnnClassifier','100,Clinton Ave S, Rochester, NY, 14604', 6, 1400)

{'model': 'crimeKnnClassifier',
 'response': [0.0,
  0.17648563270829834,
  0.5757851755129982,
  0.0,
  0.0,
  0.0,
  0.24772919177870334],
 'uuid': '527b36b3-ed8a-441d-89f7-80183d27a2b8',
 'version': 15}

In [123]:
def add(x,y):
    import numpy as np
    return np.add(x, y).tolist()

client.deploy('add', add, 'Adds two numbers x and y')

In [126]:
from tabpy_client.schema import generate_schema

schema = generate_schema(
  input={'x': 3, 'y': 2},
  output=5,
  input_description={'x': 'first value',
                     'y': 'second value'},
  output_description='the sum of x and y')

client.deploy('add', add, 'Adds two numbers x and y', schema=schema, override = True)

In [127]:
client.query('add',10,20)

{'model': 'add',
 'response': 30,
 'uuid': 'd2b8fd14-4bc3-4f41-828f-c12588396131',
 'version': 2}