<H4 align='left'>12/10/2017</H4> <H4 align='right' >E-63 Big Data Analytics</H4>



<H2 align='center'>Crime Scene Rochester NY </H2>
<H4 align='center'>A Big Data Case Study in Social Sciences</H4>
<H4 align='right'>By : Ajay Antony Kliyara Philip</H4>

<H2 align='center'> Part 3 : Predicting Crime using SKLearn </H2>


In [1]:
#reference :  https://elitedatascience.com/python-machine-learning-tutorial-scikit-learn
#importing required packages
import pandas as pd
import numpy as np

#sklearn module for train-test split of data
from sklearn.model_selection import train_test_split

#sklearn module for normalizing data
from sklearn import preprocessing

#sklearn module for making ML pipeline
from sklearn.pipeline import make_pipeline

#sklearn module for hyper parameter tuning
from sklearn.model_selection import GridSearchCV

#sklearn module for determining accuracy of classifiers
from sklearn.metrics import accuracy_score

#sklearn module for balancing data
from imblearn.under_sampling import RandomUnderSampler

#sklearn module classifier algorithms
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

#module to save models to be retreived later
from sklearn.externals import joblib

#importing tabpy client
import tabpy_client

In [4]:
#reading data from csv
crime_raw_data = pd.read_csv('data/rpd_crime_2011_to_present.csv')
crime_raw_data.shape

(76228, 40)

In [5]:
#converting city names to upper
crime_raw_data.loc[:,['Address_City']] = crime_raw_data.Address_City.apply(str.upper)

In [6]:
#only getting rochester data
roc_crime_data = crime_raw_data.loc[crime_raw_data.Address_City=='ROCHESTER',:]
#roc_crime_data = crime_raw_data.loc[crime_raw_data.Statute_Text!='Larceny',:]
roc_crime_data.shape

(75468, 40)

In [7]:
#remove any rows will null records
roc_crime_data = roc_crime_data.dropna()

In [8]:
roc_crime_data.columns

Index(['X', 'Y', 'OBJECTID', 'Geocode_Address', 'Geocode_Street',
       'Case_Number', 'OccurredFrom_Date_Year', 'OccurredFrom_Date_Month',
       'OccurredFrom_Time', 'OccurredFrom_Timestamp',
       'OccurredThrough_Date_Year', 'OccurredThrough_Date_Month',
       'OccurredThrough_Time', 'OccurredThrough_Timestamp',
       'Reported_Date_Year', 'Reported_Date_Month', 'Reported_Time',
       'Reported_Timestamp', 'Address_StreetFull', 'Address_City',
       'Address_State', 'Patrol_Beat', 'Patrol_Section', 'Case_Status',
       'Statute_Title', 'Statute_Section', 'Statute_Subsection',
       'Statute_Degree', 'Statute_Class', 'Statute_Category', 'Statute_Text',
       'Statute_Description', 'Statute_CrimeCategory', 'Statute_Attempted',
       'Weapon_Description', 'Larceny_Type', 'Location_Type', 'Geo_Beat',
       'Geo_Section', 'Geo_Section_Num'],
      dtype='object')

In [9]:
#seprating into features and labels
roc_crime_data_features = roc_crime_data.loc[:,['X','Y','OccurredFrom_Date_Month','OccurredFrom_Time']]
label = roc_crime_data.loc[:,['Statute_Text']]

In [9]:
label.Statute_Text.value_counts()

Larceny                       44441
Burglary                      14827
Aggravated Assault             6350
Robbery                        5106
Motor Vehicle Theft            4149
Murder                          222
Non-Negligent Manslaughter       12
Name: Statute_Text, dtype: int64

In [10]:
#http://contrib.scikit-learn.org/imbalanced-learn/stable/auto_examples/under-sampling/plot_random_under_sampler.html

# Apply the random under-sampling, reducing larceny to 20000
rus = RandomUnderSampler(ratio={'Larceny' : 20000},return_indices=True)
X_resampled, y_resampled, idx_resampled = rus.fit_sample(roc_crime_data_features, label.Statute_Text.values)

In [11]:
pd.Series(y_resampled).value_counts()

Larceny                       20000
Burglary                      14827
Aggravated Assault             6350
Robbery                        5106
Motor Vehicle Theft            4149
Murder                          222
Non-Negligent Manslaughter       12
dtype: int64

In [13]:
#train test split
X_train, X_test, y_train, y_test = train_test_split(roc_crime_data_features, label, 
                                                    test_size=0.2, 
                                                    random_state=123, 
                                                    stratify=label)

In [15]:
#Baseline model
#Since larceny is the highest crime type, check the accuracy by predicting all events as 'Larceny'

print(np.mean(y_test.Statute_Text == 'Larceny'))
np.mean(y_resampled == 'Larceny')

0.591732126215


0.3947420360794221

In [16]:
X_train.shape

(60085, 4)

In [17]:
X_test.shape

(15022, 4)

<h3>Decision Tree Classifier</h3>

In [18]:
#creating pipeline for decision tree
pipeline = make_pipeline(preprocessing.StandardScaler(), 
                         DecisionTreeClassifier(random_state=0))

In [19]:
#need to add more
hyperparameters = { 'decisiontreeclassifier__criterion' : ['gini', 'entropy'],
                  'decisiontreeclassifier__max_depth': [None, 100, 50, 10],
                  'decisiontreeclassifier__splitter': ['best','random']}



In [20]:
#add time
def gridSearch(pipeline,hyperparameters):
    clft = GridSearchCV(pipeline, hyperparameters, cv=10)
    # Fit and tune model
    #clft.fit(X_train, y_train.Statute_Text.values)
    clft.fit(X_resampled, y_resampled)
    return clft

In [21]:
#grid searching through parameters 
clf = gridSearch(pipeline,hyperparameters)

In [22]:
#best classifier
print(clf.best_params_)

{'decisiontreeclassifier__criterion': 'gini', 'decisiontreeclassifier__max_depth': 10, 'decisiontreeclassifier__splitter': 'best'}


In [23]:
#checking if data is refit on entire training set
print(clf.refit)

True


In [24]:
#predicting on test set
y_pred = clf.predict(X_test)
y_pred_prob = clf.predict_proba(X_test)

In [25]:
np.unique(y_pred)

array(['Aggravated Assault', 'Burglary', 'Larceny', 'Motor Vehicle Theft',
       'Murder', 'Robbery'], dtype=object)

In [26]:
#distribution in test set
pd.Series(y_pred).value_counts()

Larceny                7908
Burglary               5441
Aggravated Assault     1391
Robbery                 192
Motor Vehicle Theft      89
Murder                    1
dtype: int64

In [27]:
#checking accuracy on test set
accuracy_score(y_test, y_pred)

0.54393556117694053

<b>Decision Tree accuracy is 54%</b>

In [28]:
pd.DataFrame(y_pred_prob).head()


Unnamed: 0,0,1,2,3,4,5,6
0,0.06812,0.168937,0.632153,0.046322,0.0,0.0,0.084469
1,0.038462,0.476923,0.346154,0.115385,0.0,0.007692,0.015385
2,0.208431,0.30445,0.238876,0.114754,0.014052,0.0,0.119438
3,0.0,0.012658,0.987342,0.0,0.0,0.0,0.0
4,0.051655,0.277834,0.556169,0.054162,0.0,0.0,0.060181


In [45]:
clf.best_estimator_.classes_

array(['Aggravated Assault', 'Burglary', 'Larceny', 'Motor Vehicle Theft',
       'Murder', 'Non-Negligent Manslaughter', 'Robbery'], dtype=object)

<h3>Random Forest Classifier<h3>

In [31]:
#pipeline for random forests
pipeline2 = make_pipeline(preprocessing.StandardScaler(), 
                         RandomForestClassifier(max_features=None))

In [32]:
#hyper parameter tuning
hyperparameters2 = { 'randomforestclassifier__criterion' : ['gini', 'entropy'],
                  'randomforestclassifier__max_depth': [None, 100, 50, 10, 5]}


In [33]:
#grid searching parameters
clf2 = gridSearch(pipeline2,hyperparameters2)

In [34]:
#displaying best classifier parameters 
print(clf2.best_params_)
print(clf2.refit)


{'randomforestclassifier__criterion': 'gini', 'randomforestclassifier__max_depth': 10}
True


In [35]:
#predicting on test set
y_pred = clf2.predict(X_test)
y_pred_prob = clf2.predict_proba(X_test)

In [36]:
#distribution across crime types 
pd.Series(y_pred).value_counts()

Larceny                       8676
Burglary                      5124
Aggravated Assault             979
Robbery                        188
Motor Vehicle Theft             54
Non-Negligent Manslaughter       1
dtype: int64

In [37]:
#accuracy on test set
accuracy_score(y_test, y_pred)

0.59013446944481429

<b>Accuracy of Random forest classifier 59%</b>

<h3>Adaboost Classifier</h3>

In [40]:
#pipeline for adaboost
pipeline3 = make_pipeline(preprocessing.StandardScaler(), 
                         AdaBoostClassifier())

In [41]:
#hyper parameter setting
hyperparameters3 = {  'adaboostclassifier__algorithm' : ['SAMME', 'SAMME.R'],
                  'adaboostclassifier__n_estimators': [100, 50, 35, 25, 10]}


In [42]:
#grid searching through parameters
clf3 = gridSearch(pipeline3,hyperparameters3)

In [43]:
#displaying best parameters
print(clf3.best_params_)
print(clf3.refit)

{'adaboostclassifier__algorithm': 'SAMME.R', 'adaboostclassifier__n_estimators': 35}
True


In [44]:
#predicting for test dataset
y_pred = clf3.predict(X_test)
y_pred_prob = clf3.predict_proba(X_test)

In [45]:
#distribution of prediction labels
pd.Series(y_pred).value_counts()

Larceny                       11769
Burglary                       2964
Aggravated Assault              270
Non-Negligent Manslaughter       19
dtype: int64

In [46]:
#accuracy score on test set
accuracy_score(y_test, y_pred)

0.55292238050858744

<b>Adabost accuracy was found to be 55%</b>

<h3>K Nearest Neighbors Classifier<h3>

In [47]:
#pipeline for nearest neighbors
pipeline4 = make_pipeline(preprocessing.StandardScaler(), 
                         KNeighborsClassifier())

In [48]:
#hyper parameter setting
hyperparameters4 = {  'kneighborsclassifier__n_neighbors' : [4,5,6,7],
                  'kneighborsclassifier__weights': ['distance'],
                   'kneighborsclassifier__algorithm': ['auto']}


In [49]:
#grid search parameters
clf4 = gridSearch(pipeline4,hyperparameters4)

In [51]:
#displaying best parameter
print(clf4.best_params_)
print(clf4.refit)

{'kneighborsclassifier__n_neighbors': 7, 'kneighborsclassifier__algorithm': 'auto', 'kneighborsclassifier__weights': 'distance'}
True


In [52]:
#predicting on test set
y_pred = clf4.predict(X_test)
y_pred_prob = clf4.predict_proba(X_test)

In [53]:
pd.Series(y_pred).value_counts()

Larceny                       7011
Burglary                      4168
Aggravated Assault            1593
Robbery                       1234
Motor Vehicle Theft            965
Murder                          49
Non-Negligent Manslaughter       2
dtype: int64

In [54]:
#test set accuracy
accuracy_score(y_test, y_pred)

0.87371854613233924

<b>Accuracy of K Nearest Neighbors was found to be 86% </b>

<h3>Saving the Model</h3>

In [55]:
#saving the model
joblib.dump(clf4, 'trained_model/knn_classifier.pkl')

['trained_model/knn_classifier.pkl']

In [56]:
#reloading and test
clf5 = joblib.load('trained_model/knn_classifier.pkl')
y_pred = clf5.predict(X_test)
y_pred_prob = clf5.predict_proba(X_test)
accuracy_score(y_test, y_pred)

0.87371854613233924

<h3> Simulation of User using the model<h3>

In [57]:
import geocoder
#method to get address from user , month of year and time of day and predicting using the model 
def get_data():
    input_add = input("Enter address, comma separated")
    lat_long = geocoder.google(input_add).latlng
    print(lat_long)
    X = float(lat_long[1])
    Y = float(lat_long[0])
    month = int(input("Enter Month of Year : 1-12"))
    time_24hr = int(input("Enter time : 24hr clock"))
    
    input_data = {'X' : X, 'Y' : Y, 'OccurredFrom_Date_Month' : month, 'OccurredFrom_Time' : time_24hr}
    l = clf4.predict_proba(pd.DataFrame([input_data]))[0].tolist()
    z = zip(clf4.classes_.tolist(),l)
    for i in z:
        print(i)
    return z

    

In [58]:
clf4.classes_

array(['Aggravated Assault', 'Burglary', 'Larceny', 'Motor Vehicle Theft',
       'Murder', 'Non-Negligent Manslaughter', 'Robbery'], dtype=object)

In [59]:
#sample address - 100,clinton ave south, rochester, 14604
output_data = get_data()

Enter address, comma separated100,clinton ave south, rochester, 14604
[43.1545777, -77.6046532]
Enter Month of Year : 1-126
Enter time : 24hr clock12
('Aggravated Assault', 0.0)
('Burglary', 0.5714300787958991)
('Larceny', 0.28571383414138063)
('Motor Vehicle Theft', 0.1428560870627203)
('Murder', 0.0)
('Non-Negligent Manslaughter', 0.0)
('Robbery', 0.0)


In [60]:
#defining function to work with tableau, it gets input from tableau
def crimeKnnClassifier(address,month,time_24hr):
    #load saved model
    clf = joblib.load('/Users/ajaykliyara/Documents/cs63/final_project/trained_model/knn_classifier.pkl')
    #get lat long for address
    lat_long = geocoder.google(address).latlng
    #print(clf)
    #print(lat_long)
    X = float(lat_long[1])
    Y = float(lat_long[0])
    #get month and time of day from user
    input_data = {'X' : X, 'Y' : Y, 'OccurredFrom_Date_Month' : month, 'OccurredFrom_Time' : time_24hr}
    input_data = pd.DataFrame([input_data])
    input_data = input_data.loc[:,['X','Y','OccurredFrom_Date_Month','OccurredFrom_Time']]
    #predict probability of crime
    l = clf.predict_proba(input_data)[0].tolist()
    z = zip(clf.classes_.tolist(),l)
    #print(z)
    lst=[]
    for i in z:
        lst.append(i[1])
    
    #return list of probability of each crime type
    return lst

In [70]:
result = crimeKnnClassifier('100,Clinton Ave S,Rochester, 14604',6,200)
print(result)
print(clf.classes_)

[0.2556242288638181, 0.0, 0.34688720144063795, 0.0, 0.0, 0.0, 0.3974885696955439]
['Aggravated Assault' 'Burglary' 'Larceny' 'Motor Vehicle Theft' 'Murder'
 'Non-Negligent Manslaughter' 'Robbery']


In [62]:
result = crimeKnnClassifier('798,Jay St,Rochester,NY, 14611',2,1815)
result

[0.0,
 0.4931111765819034,
 0.38467656324038796,
 0.0697557636156548,
 0.0,
 0.0,
 0.05245649656205392]

<h3>Deploying to TabPy Server</h3>

In [65]:
#reference : https://github.com/tableau/TabPy/blob/master/client.md
#connecting to tabpy client
client = tabpy_client.Client('http://localhost:9004/')

In [63]:
#generate schema - defined input and output of model
from tabpy_client.schema import generate_schema
schema = generate_schema(
  input={'address': '100,Clinton Ave S, Rochester, NY, 14604',
         'month': 6,
         'time_24hr' : 1400},
  output=[0.0,0.2,0.4,0.2,0.2],
  input_description={'address': 'address separated by comma',
                     'month': 'month of the year',
                     'time_24hr' : 'time of the day'},
  output_description='probability of a crime type at the given location, month and time')

In [66]:
#deploying model to tabpy server
client.deploy('crimeKnnClassifier', crimeKnnClassifier, 
              'Returns probabilites of Crime time for given address, month and time of day', schema = schema, 
              override=True)

In [67]:
#testing if model deployed on tabpy server can be queried
client.query('crimeKnnClassifier','100,Clinton Ave S, Rochester, NY, 14604', 6, 1400)

{'model': 'crimeKnnClassifier',
 'response': [0.0,
  0.041860260453154184,
  0.9581397395468458,
  0.0,
  0.0,
  0.0,
  0.0],
 'uuid': 'e4f03052-e144-455f-9177-35bd7795179b',
 'version': 16}