## Preprocessing

In [1]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import tensorflow as tf

# Load the cleaned crime CSV into a DataFrame.
# Two copies of the data exist:
#   * remote (268MB, allow time for download; stored on a private server to
#     provide stable static hosting):
#       http://www.andrewlane.us/data/crime_data2020-2024.csv
#   * local file, used here for testing or when no internet is available.
crime_csv_path = "resources/cleaned_data/crime_2020.csv"
df = pd.read_csv(crime_csv_path)
df.head()

Unnamed: 0,DR_NO,Date Rptd,DATE OCC,TIME OCC,AREA,AREA NAME,Rpt Dist No,Part 1-2,Crm Cd,Crm Cd Desc,...,Crm Cd 1,Crm Cd 2,Crm Cd 3,Crm Cd 4,LOCATION,Cross Street,LAT,LON,crime_timestamp,Year
0,190326475,03/01/2020 12:00:00 AM,2020-03-01,2130,7,Wilshire,784,1,510,VEHICLE - STOLEN,...,510.0,998.0,,,1900 S LONGWOOD AV,,34.0375,-118.3506,2020-03-01 21:30:00,2020
1,200106753,02/09/2020 12:00:00 AM,2020-02-08,1800,1,Central,182,1,330,BURGLARY FROM VEHICLE,...,330.0,998.0,,,1000 S FLOWER ST,,34.0444,-118.2628,2020-02-08 18:00:00,2020
2,200320258,11/11/2020 12:00:00 AM,2020-11-04,1700,3,Southwest,356,1,480,BIKE - STOLEN,...,480.0,,,,1400 W 37TH ST,,34.021,-118.3002,2020-11-04 17:00:00,2020
3,200907217,05/10/2023 12:00:00 AM,2020-03-10,2037,9,Van Nuys,964,1,343,SHOPLIFTING-GRAND THEFT ($950.01 & OVER),...,343.0,,,,14000 RIVERSIDE DR,,34.1576,-118.4387,2020-03-10 20:37:00,2020
4,200412582,09/09/2020 12:00:00 AM,2020-09-09,630,4,Hollenbeck,413,1,510,VEHICLE - STOLEN,...,510.0,,,,200 E AVENUE 28,,34.082,-118.213,2020-09-09 06:30:00,2020


In [2]:
# Summarise the loaded frame: column names, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 199846 entries, 0 to 199845
Data columns (total 30 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   DR_NO            199846 non-null  int64  
 1   Date Rptd        199846 non-null  object 
 2   DATE OCC         199846 non-null  object 
 3   TIME OCC         199846 non-null  int64  
 4   AREA             199846 non-null  int64  
 5   AREA NAME        199846 non-null  object 
 6   Rpt Dist No      199846 non-null  int64  
 7   Part 1-2         199846 non-null  int64  
 8   Crm Cd           199846 non-null  int64  
 9   Crm Cd Desc      199846 non-null  object 
 10  Mocodes          173090 non-null  object 
 11  Vict Age         199846 non-null  int64  
 12  Vict Sex         174360 non-null  object 
 13  Vict Descent     174357 non-null  object 
 14  Premis Cd        199844 non-null  float64
 15  Premis Desc      199777 non-null  object 
 16  Weapon Used Cd   72978 non-null   floa

In [3]:
# Keep only the fields that would be available during a live 911 call
# (time of occurrence and location), plus the victim's sex (Vict Sex),
# which is the value the model is trained to predict.
columns_to_keep = ['TIME OCC', 'LAT', 'LON', 'Vict Sex']
df = df[columns_to_keep]
df.head()

Unnamed: 0,TIME OCC,LAT,LON,Vict Sex
0,2130,34.0375,-118.3506,M
1,1800,34.0444,-118.2628,M
2,1700,34.021,-118.3002,X
3,2037,34.1576,-118.4387,M
4,630,34.082,-118.213,


In [4]:
# Re-check non-null counts and dtypes of the reduced frame (shows which columns have missing values)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 199846 entries, 0 to 199845
Data columns (total 4 columns):
 #   Column    Non-Null Count   Dtype  
---  ------    --------------   -----  
 0   TIME OCC  199846 non-null  int64  
 1   LAT       199846 non-null  float64
 2   LON       199846 non-null  float64
 3   Vict Sex  174360 non-null  object 
dtypes: float64(2), int64(1), object(1)
memory usage: 6.1+ MB


In [5]:
# Count distinct non-null values per column (reveals how many Vict Sex codes exist)
df.nunique()

TIME OCC    1436
LAT         5015
LON         4777
Vict Sex       4
dtype: int64

In [6]:
# Keep only rows whose recorded victim sex is 'M' or 'F'; rows with a missing
# value or any other code are removed. The explicit .copy() makes the result an
# independent frame, so adding a column below does not raise pandas'
# SettingWithCopyWarning.
df = df[df['Vict Sex'].isin(['M', 'F'])].copy()

# Create the binary target column 'Is F': 1 when the victim is female, 0 when male
df['Is F'] = (df['Vict Sex'] == 'F').astype('int64')

# Drop the Vict Sex column to keep the raw label out of the training features
df = df.drop(columns=['Vict Sex'])

In [7]:
# Preview the frame after filtering/encoding; gaps in the index are rows that were filtered out
df.head()

Unnamed: 0,TIME OCC,LAT,LON,Is F
0,2130,34.0375,-118.3506,0
1,1800,34.0444,-118.2628,0
3,2037,34.1576,-118.4387,0
11,1430,34.0881,-118.1877,0
15,30,34.0467,-118.252,0


In [8]:
# Summary statistics for the feature columns and the 'Is F' target.
# NOTE(review): the recorded output shows LAT min = 0.0 and LON max = 0.0, which
# presumably are placeholder coordinates rather than real locations — confirm,
# and consider filtering them out before training.
df.describe()

Unnamed: 0,TIME OCC,LAT,LON,Is F
count,157726.0,157726.0,157726.0,157726.0
mean,1342.122922,33.937263,-117.873897,0.461141
std,647.52556,2.177579,7.55404,0.498489
min,1.0,0.0,-118.6676,0.0
25%,920.0,34.0126,-118.4351,0.0
50%,1420.0,34.0599,-118.3276,0.0
75%,1855.0,34.1688,-118.2756,1.0
max,2359.0,34.3293,0.0,1.0


In [9]:
# Target array: the binary 'Is F' label
target_column = 'Is F'
y = df[target_column].to_numpy()

# Feature array: every remaining column
X = df.drop(columns=[target_column]).to_numpy()

# Hold out a test set (scikit-learn's default split size); fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=8,
)

In [10]:
# Standardise the features. The scaler is fitted on the training split only and
# the same fitted transformation is then applied to both splits.
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

# Scale the training and testing data with the fitted scaler
X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)

## Compile, Train and Evaluate the Model

In [11]:
# Seed the Python, NumPy and TensorFlow random generators before any weights are
# created, so that a fresh "Restart & Run All" starts from the same initial
# weights (full determinism still depends on the hardware/backend in use).
# The value matches the random_state used for train_test_split.
tf.keras.utils.set_random_seed(8)

# Define the model - deep neural net, i.e., the number of input features and
# hidden nodes for each layer.
number_input_features = X_train.shape[1]  # one input per feature column
hidden_nodes_layer1 = 4  # neural units tried: 2,4,8,16,32,64,128,256
hidden_nodes_layer2 = 2  # multiple layers attempted (a 3rd and 4th 2-unit ReLU layer were also tried)

nn = tf.keras.models.Sequential()

# First hidden layer (also declares the input width)
nn.add(
    tf.keras.layers.Dense(units=hidden_nodes_layer1,
                          input_dim=number_input_features,
                          activation="relu")
)

# Second hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2,
                             activation="relu"))

# Output layer: a single sigmoid unit for the binary 'Is F' target
nn.add(tf.keras.layers.Dense(units=1,
                             activation="sigmoid"))

# Check the structure of the model
nn.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 4)                 16        
                                                                 
 dense_1 (Dense)             (None, 2)                 10        
                                                                 
 dense_2 (Dense)             (None, 1)                 3         
                                                                 
Total params: 29 (116.00 Byte)
Trainable params: 29 (116.00 Byte)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [12]:
# Configure training: Adam optimiser, binary cross-entropy loss (matches the
# single sigmoid output), and accuracy reported as the metric.
nn.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

In [13]:
# Fit the network on the scaled training split for 20 epochs and keep the
# returned training history.
# Author's observation: no improved accuracy after 2 epochs.
fit_model = nn.fit(
    x=X_train_scaled,
    y=y_train,
    epochs=20,
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [14]:
# Score the trained model on the held-out, scaled test split.
# NOTE(review): the recorded accuracy (~0.545) is only marginally above the share
# of the majority class suggested by the describe() output (mean of 'Is F' is
# ~0.461, i.e. ~0.539 are 0) — presumably the model has learned very little
# from these features; confirm against the test-split class balance.
evaluation = nn.evaluate(X_test_scaled, y_test, verbose=2)
model_loss, model_accuracy = evaluation
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

1233/1233 - 0s - loss: 0.6883 - accuracy: 0.5451 - 318ms/epoch - 258us/step
Loss: 0.6882994174957275, Accuracy: 0.5450902581214905


In [15]:
# Export our model to HDF5 file
# NOTE(review): the truncated output recorded under this cell
# (`saving_api.save_model(`) looks like the tail of a Keras warning about this
# save call — presumably the legacy-HDF5-format notice; confirm before relying
# on the .h5 format long term.
nn.save('victim_is_F.h5')

  saving_api.save_model(
