In [2]:
import numpy as np
import pandas as pd

import warnings
warnings.simplefilter('ignore',FutureWarning)

In [3]:
crimes = pd.read_csv("cleaned_crime.csv")

In [4]:
crimes.shape

(2003836, 25)

In [5]:
crimes.columns

Index(['Unnamed: 0', 'ID', 'Case Number', 'DateTime', 'Block', 'IUCR',
       'Primary Type', 'Description', 'Location Description', 'Arrest',
       'Domestic', 'Beat', 'District', 'Ward', 'Community Area', 'FBI Code',
       'X Coordinate', 'Y Coordinate', 'Year', 'Updated On', 'Latitude',
       'Longitude', 'Location', 'Date', 'Time'],
      dtype='object')

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from keras.utils import to_categorical

Using TensorFlow backend.


In [7]:
crimes["DateTime"]=pd.to_datetime(crimes["DateTime"], format="%Y-%m-%d %H:%M:%S")

In [8]:
crimes["Hour"]=crimes["DateTime"].dt.hour

In [9]:
crimes["Date"]=crimes["DateTime"].dt.dayofyear

In [10]:
crimes["Location Description"].str.strip()

0                                  STREET
1                                  STREET
2                               APARTMENT
3                               APARTMENT
4                                SIDEWALK
5                               APARTMENT
6                               APARTMENT
7                               APARTMENT
8                                  STREET
9                 RESIDENCE PORCH/HALLWAY
10         PARKING LOT/GARAGE(NON.RESID.)
11            CTA GARAGE / OTHER PROPERTY
12                 DRIVEWAY - RESIDENTIAL
13                               SIDEWALK
14                            GAS STATION
15                              APARTMENT
16                                  OTHER
17                     GROCERY FOOD STORE
18                                  ALLEY
19                          BAR OR TAVERN
20                     GROCERY FOOD STORE
21                                  OTHER
22                                  OTHER
23                              AP

In [11]:
crimes["Location Description"].value_counts().head(80)#.index
location_list = crimes["Location Description"].value_counts().head(80).index

In [12]:
crimes = crimes[crimes["Location Description"].isin(location_list)]

In [13]:
crimes.shape

(1997697, 26)

In [23]:
len(crimes["Beat"].unique())

304

In [18]:
crimes["Beat"].max()

2535

In [22]:
sample = crimes.sample(frac=0.4)

del crimes

In [13]:
len(crimes["Primary Type"].value_counts())

35

In [23]:
X = sample[["Hour","Location Description","Beat","Date","Primary Type"]]
y = sample["Arrest"]

In [24]:
sample["Arrest"].value_counts()

False    1932320
True      744970
Name: Arrest, dtype: int64

In [25]:
X = pd.get_dummies(X,prefix=["Location Description","Primary Type"])

In [26]:
# onehot = OneHotEncoder()
# X["Location Description"]=onehot.fit_transform(X["Location Description"].fillna('0'))
# X["Primary Type"]=onehot.fit_transform(X["Primary Type"])

In [27]:
X_train, X_test,y_train,y_test = train_test_split(
    X,y, random_state=42, stratify=y)
X_scaler = StandardScaler().fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)






### Logistic Regression

In [28]:
from sklearn.linear_model import LogisticRegression
lg = LogisticRegression()

lg.fit(X_train_scaled, y_train)

score1 = lg.score(X_train_scaled, y_train)
score2 = lg.score(X_test_scaled, y_test)

In [29]:
print(f"{score1}\n{score2}")

0.8659071588327896
0.8658689451878988


In [51]:
import pickle

pickle.dump(lg, open())

### Deep Learning

In [43]:
label_encoder=LabelEncoder()
label_encoder.fit(y_train)
encoded_y_train = label_encoder.transform(y_train)
encoded_y_test = label_encoder.transform(y_test)

y_train_categorical = to_categorical(encoded_y_train)
y_test_categorical = to_categorical(encoded_y_test)

In [54]:
from keras.models import Sequential
from keras.layers import Dense

model = Sequential()
model.add(Dense(units=12, activation='relu', input_dim=117))
model.add(Dense(units=20, activation='relu'))
model.add(Dense(units=2, activation='softmax'))

In [55]:
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

In [56]:
from keras.callbacks import EarlyStopping

model.fit(
    X_train_scaled,
    y_train,
    epochs=50,
    shuffle=True,
    verbose=2,
    callbacks=[EarlyStopping(monitor='acc', patience=10, verbose=2)]
)

Epoch 1/50
 - 59s - loss: 0.3246 - acc: 0.8737
Epoch 2/50
 - 59s - loss: 0.3208 - acc: 0.8753
Epoch 3/50
 - 59s - loss: 0.3203 - acc: 0.8754
Epoch 4/50
 - 59s - loss: 0.3200 - acc: 0.8755
Epoch 5/50
 - 61s - loss: 0.3199 - acc: 0.8755
Epoch 6/50
 - 61s - loss: 0.3197 - acc: 0.8755
Epoch 7/50
 - 61s - loss: 0.3195 - acc: 0.8756
Epoch 8/50
 - 61s - loss: 0.3195 - acc: 0.8756
Epoch 9/50
 - 62s - loss: 0.3194 - acc: 0.8755
Epoch 10/50
 - 61s - loss: 0.3194 - acc: 0.8756
Epoch 11/50
 - 61s - loss: 0.3193 - acc: 0.8756
Epoch 12/50
 - 61s - loss: 0.3193 - acc: 0.8756
Epoch 13/50
 - 61s - loss: 0.3194 - acc: 0.8756
Epoch 14/50
 - 64s - loss: 0.3194 - acc: 0.8756
Epoch 15/50
 - 63s - loss: 0.3194 - acc: 0.8756
Epoch 16/50
 - 74s - loss: 0.3195 - acc: 0.8756
Epoch 17/50
 - 70s - loss: 0.3194 - acc: 0.8757
Epoch 18/50
 - 72s - loss: 0.3194 - acc: 0.8757
Epoch 19/50
 - 73s - loss: 0.3194 - acc: 0.8756
Epoch 20/50
 - 67s - loss: 0.3194 - acc: 0.8757
Epoch 21/50
 - 63s - loss: 0.3194 - acc: 0.8756
E

<keras.callbacks.History at 0x20236b076a0>

In [58]:
model_loss, model_accuracy = model.evaluate(
    X_test_scaled, y_test, verbose=2)
print(
    f"Normal Neural Network - Loss: {model_loss}, Accuracy: {model_accuracy}")

Normal Neural Network - Loss: 0.3201186702995422, Accuracy: 0.8758267682422962


In [60]:
model.save("Cha_Crime_Onehot.h5")