## Preprocessing

In [1]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import tensorflow as tf

# Import pandas and read the charity_data.csv from the provided cloud URL.
import pandas as pd
df = pd.read_csv("resources/cleaned_data/crime_2020.csv")
df.head()

Unnamed: 0,DR_NO,Date Rptd,DATE OCC,TIME OCC,AREA,AREA NAME,Rpt Dist No,Part 1-2,Crm Cd,Crm Cd Desc,...,Crm Cd 1,Crm Cd 2,Crm Cd 3,Crm Cd 4,LOCATION,Cross Street,LAT,LON,crime_timestamp,Year
0,190326475,03/01/2020 12:00:00 AM,2020-03-01,2130,7,Wilshire,784,1,510,VEHICLE - STOLEN,...,510.0,998.0,,,1900 S LONGWOOD AV,,34.0375,-118.3506,2020-03-01 21:30:00,2020
1,200106753,02/09/2020 12:00:00 AM,2020-02-08,1800,1,Central,182,1,330,BURGLARY FROM VEHICLE,...,330.0,998.0,,,1000 S FLOWER ST,,34.0444,-118.2628,2020-02-08 18:00:00,2020
2,200320258,11/11/2020 12:00:00 AM,2020-11-04,1700,3,Southwest,356,1,480,BIKE - STOLEN,...,480.0,,,,1400 W 37TH ST,,34.021,-118.3002,2020-11-04 17:00:00,2020
3,200907217,05/10/2023 12:00:00 AM,2020-03-10,2037,9,Van Nuys,964,1,343,SHOPLIFTING-GRAND THEFT ($950.01 & OVER),...,343.0,,,,14000 RIVERSIDE DR,,34.1576,-118.4387,2020-03-10 20:37:00,2020
4,200412582,09/09/2020 12:00:00 AM,2020-09-09,630,4,Hollenbeck,413,1,510,VEHICLE - STOLEN,...,510.0,,,,200 E AVENUE 28,,34.082,-118.213,2020-09-09 06:30:00,2020


In [2]:
df = df.drop(columns=['DR_NO', 
                      'Date Rptd',
                      'DATE OCC',
                      'AREA NAME', 
                      'Crm Cd Desc', 
                      'Mocodes',
                      'Crm Cd 1', 
                      'Crm Cd 2', 
                      'Crm Cd 3', 
                      'Crm Cd 4', 
                      'LOCATION', 
                      'Cross Street', 
                      'Premis Desc', 
                      'Weapon Desc', 
                      'Status Desc',
                      'crime_timestamp',
                      'LAT', 
                      'LON'])

In [3]:
# Determine the number of unique values in each column.
df.nunique()

TIME OCC          1436
AREA                21
Rpt Dist No       1155
Part 1-2             2
Crm Cd             129
Vict Age           104
Vict Sex             4
Vict Descent        19
Premis Cd          300
Weapon Used Cd      77
Status               5
Year                 1
dtype: int64

In [4]:
df = df[df['Vict Age'] > 0]

In [5]:
df['Is Minor'] = df['Vict Age'].apply(lambda x: 1 if x < 18 else 0)

In [6]:
df.head()

Unnamed: 0,TIME OCC,AREA,Rpt Dist No,Part 1-2,Crm Cd,Vict Age,Vict Sex,Vict Descent,Premis Cd,Weapon Used Cd,Status,Year,Is Minor
1,1800,1,182,1,330,47,M,O,128.0,,IC,2020,0
2,1700,3,356,1,480,19,X,X,502.0,,IC,2020,0
3,2037,9,964,1,343,19,M,O,405.0,,IC,2020,0
11,1430,4,407,1,310,27,M,W,221.0,,IC,2020,0
19,1615,6,646,2,805,23,F,H,101.0,,AA,2020,0


In [7]:
# Convert categorical data to numeric with `pd.get_dummies`
df_dummies = pd.get_dummies(df, columns=['Vict Sex', 
                                         'Vict Descent',
                                         'Status'])
df_dummies.head()

Unnamed: 0,TIME OCC,AREA,Rpt Dist No,Part 1-2,Crm Cd,Vict Age,Premis Cd,Weapon Used Cd,Year,Is Minor,...,Vict Descent_U,Vict Descent_V,Vict Descent_W,Vict Descent_X,Vict Descent_Z,Status_AA,Status_AO,Status_IC,Status_JA,Status_JO
1,1800,1,182,1,330,47,128.0,,2020,0,...,False,False,False,False,False,False,False,True,False,False
2,1700,3,356,1,480,19,502.0,,2020,0,...,False,False,False,True,False,False,False,True,False,False
3,2037,9,964,1,343,19,405.0,,2020,0,...,False,False,False,False,False,False,False,True,False,False
11,1430,4,407,1,310,27,221.0,,2020,0,...,False,False,True,False,False,False,False,True,False,False
19,1615,6,646,2,805,23,101.0,,2020,0,...,False,False,False,False,False,True,False,False,False,False


In [8]:
# Split our preprocessed data into our features and target arrays
y = df_dummies['Is Minor'].values
X = df_dummies.drop('Is Minor', axis=1).values

# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=8)

In [9]:
# Create a StandardScaler instances
scaler = StandardScaler()
# scaler = RobustScaler()
# scaler = MinMaxScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

## Compile, Train and Evaluate the Model

In [None]:
# Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.
number_input_features = len(X_train[0])
hidden_nodes_layer1 = 8
hidden_nodes_layer2 = 4
hidden_nodes_layer3 = 8
hidden_nodes_layer4 = 2


nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(
    tf.keras.layers.Dense(units=hidden_nodes_layer1, 
                          input_dim=number_input_features, 
                          activation="relu")
)

# Second hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, 
                             activation="relu"))
# nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer3, 
#                              activation="relu"))
# nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer4, 
#                              activation="relu"))

# Output layer
nn.add(tf.keras.layers.Dense(units=1,
                             activation="sigmoid"))

# Check the structure of the model
nn.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 1)                 38        
                                                                 
 dense_1 (Dense)             (None, 1)                 2         
                                                                 
Total params: 40 (160.00 Byte)
Trainable params: 40 (160.00 Byte)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [11]:
# Compile the model
nn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [12]:
# Train the model
fit_model = nn.fit(X_train_scaled,y_train,epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [13]:
# Evaluate the model using the test data
model_loss, model_accuracy = nn.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

1184/1184 - 0s - loss: nan - accuracy: 0.9645 - 277ms/epoch - 234us/step
Loss: nan, Accuracy: 0.9645468592643738


In [14]:
# Export our model to HDF5 file
nn.save('victim_is_minor.h5')

  saving_api.save_model(
