In [1]:
import numpy as np
import pandas as pd

import warnings
warnings.simplefilter('ignore',FutureWarning)

In [2]:
crimes = pd.read_csv("cleaned_crime.csv")

In [3]:
crimes.shape

(2003836, 25)

In [4]:
crimes.columns

Index(['Unnamed: 0', 'ID', 'Case Number', 'DateTime', 'Block', 'IUCR',
       'Primary Type', 'Description', 'Location Description', 'Arrest',
       'Domestic', 'Beat', 'District', 'Ward', 'Community Area', 'FBI Code',
       'X Coordinate', 'Y Coordinate', 'Year', 'Updated On', 'Latitude',
       'Longitude', 'Location', 'Date', 'Time'],
      dtype='object')

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from keras.utils import to_categorical

Using TensorFlow backend.


In [6]:
crimes["DateTime"]=pd.to_datetime(crimes["DateTime"], format="%Y-%m-%d %H:%M:%S")

In [7]:
crimes["Hour"]=crimes["DateTime"].dt.hour

In [8]:
crimes["Date"]=crimes["DateTime"].dt.dayofyear

In [9]:
crimes["Location Description"].str.strip()

0                                  STREET
1                                  STREET
2                               APARTMENT
3                               APARTMENT
4                                SIDEWALK
5                               APARTMENT
6                               APARTMENT
7                               APARTMENT
8                                  STREET
9                 RESIDENCE PORCH/HALLWAY
10         PARKING LOT/GARAGE(NON.RESID.)
11            CTA GARAGE / OTHER PROPERTY
12                 DRIVEWAY - RESIDENTIAL
13                               SIDEWALK
14                            GAS STATION
15                              APARTMENT
16                                  OTHER
17                     GROCERY FOOD STORE
18                                  ALLEY
19                          BAR OR TAVERN
20                     GROCERY FOOD STORE
21                                  OTHER
22                                  OTHER
23                              AP

In [10]:
crimes["Location Description"].value_counts().head(80)#.index
location_list = crimes["Location Description"].value_counts().head(80).index

In [11]:
crimes = crimes[crimes["Location Description"].isin(location_list)]

In [12]:
crimes.shape

(1997697, 26)

In [13]:
len(crimes["Primary Type"].value_counts())

35

In [14]:
X = crimes[["Hour","Location Description","Beat","Date","Primary Type"]]
y = crimes["Arrest"]

In [16]:
X = pd.get_dummies(X,prefix=["Location Description","Primary Type"])

In [15]:
# onehot = OneHotEncoder()
# X["Location Description"]=onehot.fit_transform(X["Location Description"].fillna('0'))
# X["Primary Type"]=onehot.fit_transform(X["Primary Type"])

In [18]:
X_train, X_test,y_train,y_test = train_test_split(
    X,y, random_state=42, stratify=y)
X_scaler = StandardScaler().fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

label_encoder=LabelEncoder()
label_encoder.fit(y_train)
encoded_y_train = label_encoder.transform(y_train)
encoded_y_test = label_encoder.transform(y_test)

y_train_categorical = to_categorical(encoded_y_train)
y_test_categorical = to_categorical(encoded_y_test)

### Linear Regression

In [19]:
from sklearn.linear_model import LinearRegression
model = LinearRegression()

model.fit(X_train_scaled, y_train_categorical)

score1 = model.score(X_train_scaled, y_train_categorical)
score2 = model.score(X_test_scaled, y_test_categorical)

In [20]:
print(f"{score1}\n{score2}")

0.46441130196610475
0.4635542565771215


### Deep Learning

In [66]:
from keras.models import Sequential
from keras.layers import Dense

model = Sequential()
model.add(Dense(units=12, activation='relu', input_dim=5))
model.add(Dense(units=20, activation='relu'))
model.add(Dense(units=2, activation='softmax'))

In [67]:
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

In [None]:
from keras.callbacks import EarlyStopping

model.fit(
    X_train_scaled,
    y_train_categorical,
    epochs=50,
    shuffle=True,
    verbose=2,
    callbacks=[EarlyStopping(monitor='acc', patience=10, verbose=2)]
)

Epoch 1/50
 - 115s - loss: 0.3823 - acc: 0.8453
Epoch 2/50
 - 174s - loss: 0.3534 - acc: 0.8586
Epoch 3/50
 - 187s - loss: 0.3476 - acc: 0.8630
Epoch 4/50
 - 194s - loss: 0.3440 - acc: 0.8653
Epoch 5/50
 - 192s - loss: 0.3421 - acc: 0.8658
Epoch 6/50
 - 190s - loss: 0.3412 - acc: 0.8663
Epoch 7/50
 - 201s - loss: 0.3405 - acc: 0.8666
Epoch 8/50
 - 203s - loss: 0.3401 - acc: 0.8668
Epoch 9/50
 - 199s - loss: 0.3395 - acc: 0.8670
Epoch 10/50
 - 139s - loss: 0.3393 - acc: 0.8671
Epoch 11/50
 - 126s - loss: 0.3391 - acc: 0.8673
Epoch 12/50


In [None]:
model_loss, model_accuracy = model.evaluate(
    X_test_scaled, y_test_categorical, verbose=2)
print(
    f"Normal Neural Network - Loss: {model_loss}, Accuracy: {model_accuracy}")

In [None]:
model.save("CHA_crime_trained_onehot.h5")