# Predict criminality in Dutch neighbourhoods

In [171]:
import math
import numpy as np
from pandas import Series, DataFrame
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

#import scipy.stats as stats
#plt.rc('figure', figsize=(10, 6))
%matplotlib inline

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [176]:
excel = pd.ExcelFile('Geregistreerde-criminaliteit-per-gemeente-wijk-en-buurt-2010-2015.xlsx')
df1 = excel.parse("Tabel 6", decimal=",")
df1 = df1.loc[5:, ['Unnamed: 1', 'Totaal vermogen, vernieling en geweld']]
df1.columns = ['gm_naam', 't_hevig']
df1.shape

df2=pd.read_excel('kwb-2016.xls', decimal=",")
df2.shape

(15446, 2)

(16194, 54)

<b>Prepare Data </b>

a) Load the datasets

In [177]:
df = df2.merge(df1, on="gm_naam", how="left")
df.head()

Unnamed: 0,gm_naam,recs,a_inw,a_man,a_15_24,a_25_44,a_45_64,a_65_oo,a_gehuwd,a_gesch,...,a_bst_nb,g_pau_hh,g_pau_km,a_m2w,g_afs_gs,a_lan_ha,a_wat_ha,ste_mvs,ste_oad,t_hevig
0,Nederland,Land,16979120,8417135,2084673,4217738,4791629,3085308,6727554,1278530,...,1699000,1.0,241,652545,0.9,3367996,786306,2,1961,.
1,Appingedam,Gemeente,12001,5802,1232,2554,3565,2825,5059,1008,...,795,0.9,223,555,0.9,2378,80,3,1041,31
2,Appingedam,Wijk,12000,5800,1230,2555,3565,2825,5060,1005,...,795,0.9,223,555,0.9,2378,80,3,1041,31
3,Appingedam,Buurt,2335,1090,210,495,680,715,845,245,...,150,0.8,1194,85,0.6,84,5,3,1184,31
4,Appingedam,Buurt,3095,1525,335,620,1025,580,1445,205,...,230,1.1,957,155,1.3,158,5,4,892,31


b) Clean the data

In [178]:
#1. Drop missing values
df.replace('.', np.nan, inplace=True)
df.dropna(how='any', inplace=True)

#2. Select neighborhoods
df = df[df['recs']=='Buurt']

#3. Remove the 'recs' column as it's always "Buurt"
df = df.drop('recs', axis=1)

df.shape

(10713, 54)

c) Engineer some features

In [179]:
#1.Transform all numerical columns into int64 or float64
df = df.apply(pd.to_numeric, errors='ignore')

#2.Transform all demographic columns by dividing them by the total inhabitants (a_inw) of the neighbourhood
#Remove records with zero number of inhabitants
df = df[df['a_inw'] != 0]
  
filter_col = [col for col in df.iloc[0:,3:] if col.startswith('a_')]
df[filter_col] = df[filter_col].div(df['a_inw'], axis=0)
df.head()       

Unnamed: 0,gm_naam,a_inw,a_man,a_15_24,a_25_44,a_45_64,a_65_oo,a_gehuwd,a_gesch,a_verwed,...,a_bst_nb,g_pau_hh,g_pau_km,a_m2w,g_afs_gs,a_lan_ha,a_wat_ha,ste_mvs,ste_oad,t_hevig
3,Appingedam,2335,1090,0.089936,0.211991,0.291221,0.30621,0.361884,0.104925,0.130621,...,0.06424,0.8,1194,0.036403,0.6,0.035974,0.002141,3,1184,31.0
4,Appingedam,3095,1525,0.108239,0.200323,0.331179,0.187399,0.466882,0.066236,0.038772,...,0.074313,1.1,957,0.050081,1.3,0.05105,0.001616,4,892,31.0
5,Appingedam,5980,2870,0.103679,0.221572,0.273411,0.241639,0.414716,0.088629,0.076087,...,0.05602,0.9,872,0.044314,0.6,0.047492,0.001839,3,1108,31.0
6,Appingedam,330,175,0.106061,0.242424,0.363636,0.090909,0.454545,0.030303,0.015152,...,0.151515,1.6,35,0.106061,1.8,1.636364,0.054545,5,352,31.0
8,Appingedam,160,85,0.09375,0.125,0.34375,0.1875,0.5,0.0625,0.0,...,0.0625,0.9,7,0.03125,2.9,4.725,0.08125,5,160,31.0


d) Convert the dependent variable 't_hevig' to a categorical variable

In [180]:
df['t_hevig_niv'] = pd.qcut(df['t_hevig'], 3, labels=["LOW", "MEDIUM", "HIGH"])
df[['gm_naam', 't_hevig', 't_hevig_niv']].head()

Unnamed: 0,gm_naam,t_hevig,t_hevig_niv
3,Appingedam,31.0,MEDIUM
4,Appingedam,31.0,MEDIUM
5,Appingedam,31.0,MEDIUM
6,Appingedam,31.0,MEDIUM
8,Appingedam,31.0,MEDIUM


e) Create train-test split [the split is 0.25/0.75 by default]

In [181]:
#1. Extract Y
y = df['t_hevig_niv']
y = np.array(y)

#2. Extract X
X = df.iloc[:,1:-2]
X = np.array(X)

In [182]:
#3. Create train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=10)
X_train.shape
X_test.shape

(8034, 52)

(2679, 52)

f) Scale the data sets

In [183]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

<b>Build the Model</b>

<b>A.</b> Estimate a KNN model, with 5-fold cross validation, with grid search over the number of neighbours parameter

In [184]:
from sklearn.neighbors import KNeighborsClassifier

param_grid = {'n_neighbors': np.arange(2, 10)}
knn_grid = GridSearchCV(KNeighborsClassifier(), param_grid, cv=5).fit(X_train_scaled, y_train)

print ("Best performance: {: .3f}".format(knn_grid.best_score_))
print ("Best param: ", knn_grid.best_params_)

Best performance:  0.577
Best param:  {'n_neighbors': 8}


<b>B.</b> Estimate a Logistic Regression model, with 5-fold cross validation, with grid search over the regularization parameter

In [189]:
#Logistic regression model on the training set, with 5-fold cross validation, with grid search over the regularization parameter
from sklearn.linear_model import LogisticRegression

param_grid = {'C': np.power(10.0, np.arange(-5, 5))}
lr_grid = GridSearchCV(LogisticRegression(dual=False), param_grid, return_train_score = True, cv=5).fit(X_train_scaled, y_train)

print ("Best performance: {: .3f}".format(lr_grid.best_score_))
print ("Best param: ", lr_grid.best_params_)

Best performance:  0.607
Best param:  {'C': 0.1}


<u>Results:</u> <br> 
With L2 regularization (LogisticRegression()) - sligthly faster <br> 
Best performance:  0.607 <br> 
Best param:  {'C': 0.1} <br> 
    
With L1 regularization (LogisticRegression(penalty='l1')) - slower <br> 
Best performance:  0.607 <br> 
Best param:  {'C': 1.0} <br> 

<b>C.</b> Estimate a SVM model, with 5-fold cross validation, with grid search over the regularization parameter

In [197]:
# With Standard scaling 
from sklearn import svm

param_grid = {'C': np.power(10.0, np.arange(-5, 5))}
svc_grid = GridSearchCV(svm.LinearSVC(dual=False), param_grid, return_train_score = True, cv=5).fit(X_train_scaled, y_train)

print ("Best performance: {: .3f}".format(svc_grid.best_score_))
print ("Best param: ", svc_grid.best_params_)

Best performance:  0.603
Best param:  {'C': 0.1}


In [191]:
# code A) with MinMax scaling 
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline

param_grid = {'C': np.power(10.0, np.arange(-5, 5))}
estimators = [('scaler', MinMaxScaler()), ('grid', GridSearchCV(svm.LinearSVC(dual=False), param_grid, cv=5))]
pipe = Pipeline(estimators).fit(X_train, y_train)

svc_minmax_grid = pipe.named_steps['grid']
print ("Best performance: {: .3f}".format(svc_minmax_grid.best_score_))
print ("Best param: ", svc_minmax_grid.best_params_)

Best performance:  0.604
Best param:  {'C': 1.0}


In [192]:
# code B) with Robust scaling 
from sklearn.preprocessing import RobustScaler
from sklearn.pipeline import Pipeline

param_grid = {'C': np.power(10.0, np.arange(-5, 5))}
estimators = [('scaler', RobustScaler()), ('grid', GridSearchCV(svm.LinearSVC(dual=False), param_grid, cv=5))]
pipe = Pipeline(estimators).fit(X_train, y_train)

svc_rob_grid = pipe.named_steps['grid']
print ("Best performance: {: .3f}".format(svc_rob_grid.best_score_))
print ("Best param: ", svc_rob_grid.best_params_)

Best performance:  0.604
Best param:  {'C': 1.0}


<u>Results:</u> <br> 
With Standard scaling<br> 
Best performance:  0.604 <br> 
Best param:  {'C': 1.0} <br> 
    
With MinMax scaling <br> 
Best performance:  0.604  <br> 
Best param:  {'C': 1.0} <br> 

With Robust scaling <br> 
Best performance:  0.604 <br> 
Best param:  {'C': 1.0} <br> 

<b>D.</b> Estimate a Random Forest model, with 5-fold cross validation, with grid search over the number of estimators, number of eatures and each tree depth parameters

In [194]:
from sklearn.ensemble import RandomForestClassifier

param_grid = {'max_depth': np.arange(1, 8), 'n_estimators': np.arange(100, 150, 10), 'max_features': np.arange(3, 10)} 
tree_grid = GridSearchCV(RandomForestClassifier(n_jobs=-1, random_state=10), param_grid, cv=5).fit(X_train, y_train)
print ("Best performance: {: .3f}".format(tree_grid.best_score_))
print ("Best param: ", tree_grid.best_params_)

Best performance:  0.609
Best param:  {'max_depth': 7, 'max_features': 8, 'n_estimators': 110}


<u>Results:</u> <br> 
RandomForestClassifier <br>
Best performance:  0.609 <br>
Best param:  {'max_depth': 7, 'max_features': 8, 'n_estimators': 110} 

In [195]:
tree = RandomForestClassifier(n_estimators = 110, max_depth=7, max_features = 8, n_jobs=-1, random_state=10)
tree.fit(X_train, y_train)
scores = cross_val_score(tree, X_train, y_train)
print ("Best performance: {: .3f}".format(scores.mean()))

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=7, max_features=8, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=110, n_jobs=-1,
            oob_score=False, random_state=10, verbose=0, warm_start=False)

Best performance:  0.601


<b>Estimate the Model performance</b>

In [196]:
#A. KNN
print ("KNN performance: {: .3f}".format(knn_grid.score(X_test_scaled, y_test)))

#B. LogisticRegression with L2 regularization
print ("LogisticRegression with L2 regularization performance: {: .3f}".format(lr_grid.score(X_test_scaled, y_test)))

#C. SVC
print ("SVC performance: {: .3f}".format(svc_grid.score(X_test_scaled, y_test)))

#C_A). SVC with MinMax scaling 
print ("SVC with MinMax scaling performance: {: .3f}".format(svc_minmax_grid.score(X_test_scaled, y_test)))

#C_B). SVC with Robust scaling 
print ("SVC with Robust scaling performance: {: .3f}".format(svc_rob_grid.score(X_test_scaled, y_test)))

#D. RandomForest
print ("RandomForest performance: {: .3f}".format(tree.score(X_test, y_test)))


KNN performance:  0.592
LogisticRegression with L2 regularization performance:  0.601
SVC performance:  0.601
SVC with MinMax scaling performance:  0.551
SVC with Robust scaling performance:  0.548
RandomForest performance:  0.611


In conclusion, the best model is the RandomForest model, with an accurancy of 0.611 on the test dataset.