## Preprocessing

In [None]:
# Open the Colab file-upload prompt (presumably for the CSV files read below)
from google.colab import files

uploaded = files.upload()

In [None]:
# Import dependencies
import sklearn as skl
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd

In [None]:
# Load crash_data_2.csv into a DataFrame and preview its first rows
crash_data_df = pd.read_csv(filepath_or_buffer='crash_data_2.csv')
crash_data_df.head()

FileNotFoundError: [Errno 2] No such file or directory: 'crash_data_2.csv'

In [None]:
# Load new_york_weather.csv into a DataFrame and preview its first rows
import pandas as pd

weather_data_df = pd.read_csv(filepath_or_buffer='new_york_weather.csv')
weather_data_df.head()

In [None]:
# Build an hourly Unix timestamp for every crash so the rows can be matched
# against the weather data on its `dt` column.
crash_date = pd.to_datetime(crash_data_df['crash_date']).dt.date
crash_time = pd.to_datetime(crash_data_df['crash_time'], format='%H:%M')

# Combine date and time BEFORE rounding to the hour. Rounding the time on its
# own lets e.g. 23:45 roll over to 00:00 while the date stays unchanged, which
# would place the crash 24 hours too early.
crash_datetime = pd.to_datetime(
    crash_date.astype(str) + ' ' + crash_time.dt.strftime('%H:%M')
).dt.round('h')

# Convert to Unix time (whole seconds since the epoch). Timestamp/Timedelta
# arithmetic does not depend on the datetime column's internal resolution.
# NOTE(review): the values are parsed as naive datetimes, so this treats them
# as UTC. If crash_time is local time and the weather `dt` is UTC, localize
# before converting - confirm against the data sources.
crash_data_df['timestamp'] = (
    (crash_datetime - pd.Timestamp('1970-01-01')) // pd.Timedelta(seconds=1)
)

# Drop the raw date/time columns now that `timestamp` replaces them
crash_data_df.drop(columns=['crash_date', 'crash_time'], inplace=True)
crash_data_df.head()

In [None]:
# Cast the weather `dt` column to integers and parse the `dt_iso` strings
# into datetimes
weather_data_df['dt'] = weather_data_df['dt'].astype(int)
weather_data_df['dt_iso'] = weather_data_df['dt_iso'].pipe(
    pd.to_datetime, format='%Y-%m-%d %H:%M:%S %z UTC'
)
weather_data_df.head()

In [None]:
# Count the distinct (non-missing) values in every crash-data column
crash_data_df.nunique(axis=0, dropna=True)

In [None]:
# Count the distinct (non-missing) values in every weather-data column
weather_data_df.nunique(axis=0, dropna=True)

In [None]:
# Left-join the weather rows onto the crash rows (crash `timestamp` against
# weather `dt`), then remove every column that is not used afterwards.
columns_to_drop = [
    'latitude', 'longitude', 'number_of_persons_killed',
    'number_of_pedestrians_injured',
    'number_of_pedestrians_killed',
    'number_of_cyclist_injured',
    'number_of_cyclist_killed',
    'number_of_motorist_injured',
    'number_of_motorist_killed',
    'contributing_factor_vehicle_2',
    'contributing_factor_vehicle_3',
    'contributing_factor_vehicle_4',
    'contributing_factor_vehicle_5',
    'collision_id',
    'vehicle_type_code2',
    'dt', 'dt_iso', 'timezone',
    'city_name', 'lat', 'lon', 'sea_level', 'grnd_level',
    'dew_point', 'feels_like', 'temp_min', 'temp_max',
    'pressure', 'wind_deg', 'wind_gust', 'rain_1h',
    'rain_3h', 'snow_1h', 'snow_3h', 'clouds_all',
    'weather_id', 'weather_icon', 'location',
    'on_street_name', 'off_street_name', 'cross_street_name',
    'vehicle_type_code_3', 'vehicle_type_code_4',
    'vehicle_type_code_5',
]

merged_df = crash_data_df.merge(
    weather_data_df,
    how='left',
    left_on='timestamp',
    right_on='dt',
)
merged_df.drop(columns=columns_to_drop, inplace=True)
merged_df.head()

In [None]:
# One-hot encode the categorical columns with `pd.get_dummies`
categorical_columns = [
    'borough',
    'zip_code',
    'contributing_factor_vehicle_1',
    'vehicle_type_code1',
    'weather_main',
    'weather_description',
]
merged_df = pd.get_dummies(merged_df, columns=categorical_columns)
merged_df.head()

In [None]:
# Split the preprocessed data into features and target arrays.
# The models below end in a single sigmoid unit trained with binary
# cross-entropy, so the target has to be 0/1: 1 when at least one person was
# injured, 0 otherwise.
# `pop` removes the target column from merged_df in place and returns it, so
# no second copy of the (large) frame is made and X refers to a real frame.
# (`drop(..., inplace=True)` returns None, which previously left X as None.)
y = (merged_df.pop("number_of_persons_injured") > 0).astype(int).values
X = merged_df

# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

In [None]:
# Remove the names that are no longer needed so their memory can be reclaimed
del X, y, merged_df

In [None]:
# Create a StandardScaler instance and fit it on the training features only
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

# Apply the fitted scaler to both the training and the testing features
X_train_scaled, X_test_scaled = (
    X_scaler.transform(features) for features in (X_train, X_test)
)

## Compile, Train and Evaluate the Model

In [None]:
import tensorflow as tf

# Baseline network: one input per feature column, two ReLU hidden layers
# (8 and 5 units) and a single sigmoid output unit.
number_input_features = X_train.shape[1]
hidden_nodes_layer1 = 8
hidden_nodes_layer2 = 5

nn = tf.keras.models.Sequential([
    # First hidden layer (also declares the input size)
    tf.keras.layers.Dense(
        units=hidden_nodes_layer1,
        input_dim=number_input_features,
        activation="relu",
    ),
    # Second hidden layer
    tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="relu"),
    # Output layer
    tf.keras.layers.Dense(units=1, activation="sigmoid"),
])

# Print the layer-by-layer structure of the model
nn.summary()

In [None]:
# Configure training: Adam optimizer, binary cross-entropy loss, accuracy metric
nn.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

In [None]:
# Fit the network on the scaled training data for 5 epochs
fit_model = nn.fit(x=X_train_scaled, y=y_train, epochs=5)

In [None]:
# Score the trained network on the scaled test data and report loss/accuracy
model_loss, model_accuracy = nn.evaluate(x=X_test_scaled, y=y_test, verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

In [None]:
# Save the trained model to an HDF5 file
nn.save(filepath='Optimization.h5')

# Optimize the model

In [None]:
 # Shell escape: install the keras-tuner package (imported below as `keras_tuner`)
 !pip install keras-tuner

## 1. Get the best model hyperparameters

In [None]:
# Model-building function for keras-tuner: the tuner passes in `hp` and the
# function returns a compiled Sequential model sampled from the search space.
number_input_features = X_train.shape[1]

def create_model(hp):
    """Build and compile a Sequential classifier from tuner-chosen settings.

    The tuner picks one activation shared by all hidden layers, the width of
    the first layer, how many further hidden layers to add, and the width of
    each of those. The output is always a single sigmoid unit and the model is
    compiled with binary cross-entropy, Adam and the accuracy metric.

    Args:
        hp: hyperparameter object supplied by keras-tuner; only its `Choice`
            and `Int` methods are used.

    Returns:
        The compiled `tf.keras` Sequential model.
    """
    def tuned_units(name):
        # Every hidden layer searches the same range of widths
        return hp.Int(name, min_value=1, max_value=10, step=2)

    # One activation function is chosen for all hidden layers
    hidden_activation = hp.Choice('activation', ['relu', 'tanh', 'sigmoid'])

    # First layer: tuned width, and it declares the input size
    layers = [
        tf.keras.layers.Dense(
            units=tuned_units('first_units'),
            activation=hidden_activation,
            input_dim=number_input_features,
        )
    ]

    # Between 1 and 6 additional hidden layers, each with its own tuned width
    extra_layer_count = hp.Int('num_layers', 1, 6)
    for layer_index in range(extra_layer_count):
        layers.append(
            tf.keras.layers.Dense(
                units=tuned_units('units_' + str(layer_index)),
                activation=hidden_activation,
            )
        )

    # Single sigmoid output unit
    layers.append(tf.keras.layers.Dense(units=1, activation="sigmoid"))

    nn_model = tf.keras.models.Sequential(layers)
    nn_model.compile(
        optimizer='adam',
        loss="binary_crossentropy",
        metrics=["accuracy"],
    )
    return nn_model

In [None]:
# Import keras-tuner and set up a Hyperband search over `create_model`,
# ranking candidates by validation accuracy
import keras_tuner as kt

tuner = kt.Hyperband(
    hypermodel=create_model,
    objective="val_accuracy",
    max_epochs=20,
    hyperband_iterations=2,
)

In [None]:
# Run the kerastuner search for the best hyperparameters.
# Validate on a slice held out from the training data rather than on the test
# set: the test set is used afterwards to score the best model, so selecting
# hyperparameters against it would bias that score.
tuner.search(X_train_scaled, y_train, epochs=20, validation_split=0.2)

In [None]:
# Take the top-ranked model from the search and score it on the scaled test data
best_model = tuner.get_best_models(num_models=1)[0]
model_loss, model_accuracy = best_model.evaluate(x=X_test_scaled, y=y_test, verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

## 2. Add hidden layers

In [None]:
# Define the model - a deep neural net with five ReLU hidden layers.
# X_train is a DataFrame, so the feature count is its number of columns;
# X_train[0] would look up a column labelled 0 instead of the first row.
number_input_features = X_train.shape[1]
hidden_nodes_layer1 = 8
hidden_nodes_layer2 = 5
hidden_nodes_layer3 = 5
hidden_nodes_layer4 = 5
hidden_nodes_layer5 = 5

nn = tf.keras.models.Sequential()

# First hidden layer (also declares the input size)
nn.add(
    tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim=number_input_features, activation="relu")
)

# Second through fifth hidden layers
for hidden_units in (hidden_nodes_layer2, hidden_nodes_layer3,
                     hidden_nodes_layer4, hidden_nodes_layer5):
    nn.add(tf.keras.layers.Dense(units=hidden_units, activation="relu"))

# Output layer
nn.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Check the structure of the model
nn.summary()

In [None]:
# Configure training: Adam optimizer, binary cross-entropy loss, accuracy metric
nn.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

In [None]:
# Fit the network on the scaled training data for 5 epochs
fit_model = nn.fit(x=X_train_scaled, y=y_train, epochs=5)

In [None]:
# Score the trained network on the scaled test data and report loss/accuracy
model_loss, model_accuracy = nn.evaluate(x=X_test_scaled, y=y_test, verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

## 3. Add more neurons to hidden layers

In [None]:
# Define the model - a deep neural net with five ReLU hidden layers of 15
# units each.
# X_train is a DataFrame, so the feature count is its number of columns;
# X_train[0] would look up a column labelled 0 instead of the first row.
number_input_features = X_train.shape[1]
hidden_nodes_layer1 = 15
hidden_nodes_layer2 = 15
hidden_nodes_layer3 = 15
hidden_nodes_layer4 = 15
hidden_nodes_layer5 = 15

nn = tf.keras.models.Sequential()

# First hidden layer (also declares the input size)
nn.add(
    tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim=number_input_features, activation="relu")
)

# Second through fifth hidden layers
for hidden_units in (hidden_nodes_layer2, hidden_nodes_layer3,
                     hidden_nodes_layer4, hidden_nodes_layer5):
    nn.add(tf.keras.layers.Dense(units=hidden_units, activation="relu"))

# Output layer
nn.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Check the structure of the model
nn.summary()

In [None]:
# Configure training: Adam optimizer, binary cross-entropy loss, accuracy metric
nn.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

In [None]:
# Fit the network on the scaled training data for 5 epochs
fit_model = nn.fit(x=X_train_scaled, y=y_train, epochs=5)

In [None]:
# Score the trained network on the scaled test data and report loss/accuracy
model_loss, model_accuracy = nn.evaluate(x=X_test_scaled, y=y_test, verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

## 4. Increase the number of epochs

In [None]:
# Define the model - a deep neural net with five ReLU hidden layers of 15
# units each.
# X_train is a DataFrame, so the feature count is its number of columns;
# X_train[0] would look up a column labelled 0 instead of the first row.
number_input_features = X_train.shape[1]
hidden_nodes_layer1 = 15
hidden_nodes_layer2 = 15
hidden_nodes_layer3 = 15
hidden_nodes_layer4 = 15
hidden_nodes_layer5 = 15

nn = tf.keras.models.Sequential()

# First hidden layer (also declares the input size)
nn.add(
    tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim=number_input_features, activation="relu")
)

# Second through fifth hidden layers
for hidden_units in (hidden_nodes_layer2, hidden_nodes_layer3,
                     hidden_nodes_layer4, hidden_nodes_layer5):
    nn.add(tf.keras.layers.Dense(units=hidden_units, activation="relu"))

# Output layer
nn.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Check the structure of the model
nn.summary()

In [None]:
# Configure training: Adam optimizer, binary cross-entropy loss, accuracy metric
nn.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

In [None]:
# Fit the network on the scaled training data for 100 epochs
fit_model = nn.fit(x=X_train_scaled, y=y_train, epochs=100)

In [None]:
# Score the trained network on the scaled test data and report loss/accuracy
model_loss, model_accuracy = nn.evaluate(x=X_test_scaled, y=y_test, verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

## 5. Use different activation functions for the hidden layers

In [None]:
# Define the model - a deep neural net with five tanh hidden layers of 15
# units each.
# X_train is a DataFrame, so the feature count is its number of columns;
# X_train[0] would look up a column labelled 0 instead of the first row.
number_input_features = X_train.shape[1]
hidden_nodes_layer1 = 15
hidden_nodes_layer2 = 15
hidden_nodes_layer3 = 15
hidden_nodes_layer4 = 15
hidden_nodes_layer5 = 15

nn = tf.keras.models.Sequential()

# First hidden layer (also declares the input size)
nn.add(
    tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim=number_input_features, activation="tanh")
)

# Second through fifth hidden layers
for hidden_units in (hidden_nodes_layer2, hidden_nodes_layer3,
                     hidden_nodes_layer4, hidden_nodes_layer5):
    nn.add(tf.keras.layers.Dense(units=hidden_units, activation="tanh"))

# Output layer
nn.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Check the structure of the model
nn.summary()

In [None]:
# Configure training: Adam optimizer, binary cross-entropy loss, accuracy metric
nn.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

In [None]:
# Fit the network on the scaled training data for 100 epochs
fit_model = nn.fit(x=X_train_scaled, y=y_train, epochs=100)

In [None]:
# Score the trained network on the scaled test data and report loss/accuracy
model_loss, model_accuracy = nn.evaluate(x=X_test_scaled, y=y_test, verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

In [None]:
# Save the trained model to an HDF5 file
nn.save(filepath='Optimization_2.h5')