In [None]:
!pip install keras-tuner

In [1]:
# Import our dependencies
import pandas as pd
import matplotlib.pyplot as plt
import sklearn as skl
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import keras_tuner as kt

In [2]:
# Import and read our merged_data.csv
crash_weather_data = pd.read_csv('resources/merged_crash_weather_data.csv')
crash_weather_data.head()

Unnamed: 0,crash_datetime,number_of_persons_injured,number_of_persons_killed,number_of_pedestrians_injured,number_of_pedestrians_killed,number_of_cyclist_injured,number_of_cyclist_killed,number_of_motorist_injured,number_of_motorist_killed,collision_id,...,temp,visibility,humidity,rain_1h,weather_main,weather_description,lat,lon,distance,crash_occurred
0,2022-01-01 08:00:00,0,0,0,0,0,0,0,0,4491400,...,50.32,10000.0,88,,Clouds,overcast clouds,40.712775,-74.005973,9869.378062,1
1,2022-01-01 03:00:00,0,0,0,0,0,0,0,0,4491586,...,49.75,10000.0,87,,Clouds,overcast clouds,40.712775,-74.005973,7452.073715,1
2,2022-01-01 17:00:00,0,0,0,0,0,0,0,0,4491430,...,53.53,10000.0,91,0.38,Rain,light rain,40.712775,-74.005973,1033.866521,1
3,2022-01-01 00:00:00,0,0,0,0,0,0,0,0,4491535,...,49.91,10000.0,86,,Clouds,overcast clouds,40.712775,-74.005973,5156.983523,1
4,2022-01-01 17:00:00,0,0,0,0,0,0,0,0,4491660,...,53.53,10000.0,91,0.38,Rain,light rain,40.712775,-74.005973,7870.490486,1


In [3]:
# Convert categorical data to numeric with pd.get_dummies and drop time data
crash_weather_data = crash_weather_data.drop(columns=['crash_datetime', 'weather_datetime'])
crash_weather_data = pd.get_dummies(crash_weather_data, columns=['weather_main', 'weather_description'])
crash_weather_data.head()

Unnamed: 0,number_of_persons_injured,number_of_persons_killed,number_of_pedestrians_injured,number_of_pedestrians_killed,number_of_cyclist_injured,number_of_cyclist_killed,number_of_motorist_injured,number_of_motorist_killed,collision_id,latitude,...,weather_description_moderate rain,weather_description_overcast clouds,weather_description_scattered clouds,weather_description_sky is clear,weather_description_snow,weather_description_thunderstorm,weather_description_thunderstorm with heavy rain,weather_description_thunderstorm with light rain,weather_description_thunderstorm with rain,weather_description_very heavy rain
0,0,0,0,0,0,0,0,0,4491400,40.771477,...,False,True,False,False,False,False,False,False,False,False
1,0,0,0,0,0,0,0,0,4491586,40.646034,...,False,True,False,False,False,False,False,False,False,False
2,0,0,0,0,0,0,0,0,4491430,40.71236,...,False,False,False,False,False,False,False,False,False,False
3,0,0,0,0,0,0,0,0,4491535,40.75478,...,False,True,False,False,False,False,False,False,False,False
4,0,0,0,0,0,0,0,0,4491660,40.701195,...,False,False,False,False,False,False,False,False,False,False


In [4]:
# Split the preprocessed data into features and target arrays
y = crash_weather_data['number_of_persons_injured'].values
X = crash_weather_data.drop(columns=['number_of_persons_injured'], axis=1)

# Transform y to a vertical vector
y = y.reshape(-1,1)

# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [5]:
# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [9]:
# Create a method that creates a new Sequential model with hyperparameter options
def create_model(hp):
    nn_model = tf.keras.models.Sequential()

    # Allow kerastuner to decide number of neurons in first layer
    nn_model.add(tf.keras.layers.Dense(units=hp.Int('first_units',
        min_value=1,
        max_value=10,
        step=2), activation="tanh", input_dim=45))

    # Allow kerastuner to decide number of hidden layers and neurons in hidden layers
    for i in range(hp.Int('num_layers', 1, 6)):
        nn_model.add(tf.keras.layers.Dense(units=hp.Int('units_' + str(i),
            min_value=1,
            max_value=10,
            step=2),
            activation="tanh"))

    nn_model.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

    # Compile the model
    nn_model.compile(loss="binary_crossentropy", optimizer='adam', metrics=["accuracy"])

    return nn_model

In [11]:
tuner = kt.Hyperband(
    create_model,
    objective="val_accuracy",
    max_epochs=20,
    hyperband_iterations=2)

In [12]:
# Run the kerastuner search for best hyperparameters
tuner.search(X_train_scaled,y_train,epochs=20,validation_data=(X_test_scaled,y_test))

Trial 47 Complete [00h 00m 19s]
val_accuracy: 0.5973607897758484

Best val_accuracy So Far: 0.5973607897758484
Total elapsed time: 00h 08m 25s

Search: Running Trial #48

Value             |Best Value So Far |Hyperparameter
9                 |1                 |first_units
4                 |5                 |num_layers
5                 |7                 |units_0
7                 |1                 |units_1
1                 |1                 |units_2
1                 |1                 |units_3
1                 |1                 |units_4
9                 |None              |units_5
20                |3                 |tuner/epochs
7                 |0                 |tuner/initial_epoch
2                 |2                 |tuner/bracket
2                 |0                 |tuner/round
0043              |None              |tuner/trial_id

Epoch 8/20
[1m2388/2388[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 689us/step - accuracy: 0.5863 - loss: nan - val_accuracy:

KeyboardInterrupt: 

In [13]:
# Get best model hyperparameters
best_hyper = tuner.get_best_hyperparameters(1)[0]
best_hyper.values

{'first_units': 1,
 'num_layers': 5,
 'units_0': 7,
 'tuner/epochs': 3,
 'tuner/initial_epoch': 0,
 'tuner/bracket': 2,
 'tuner/round': 0,
 'units_1': 1,
 'units_2': 1,
 'units_3': 1,
 'units_4': 1}

In [14]:
# Evaluate best model against full test data
best_model = tuner.get_best_models(1)[0]
model_loss, model_accuracy = best_model.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

796/796 - 0s - 615us/step - accuracy: 0.5974 - loss: nan
Loss: nan, Accuracy: 0.5973607897758484


In [15]:
# Train the model
fit_model = best_model.fit(X_train_scaled,y_train,epochs=200)

Epoch 1/200
[1m2388/2388[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 530us/step - accuracy: 0.5951 - loss: nan
Epoch 2/200
[1m2388/2388[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 525us/step - accuracy: 0.5929 - loss: nan
Epoch 3/200
[1m2388/2388[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 555us/step - accuracy: 0.5940 - loss: nan
Epoch 4/200
[1m2388/2388[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 548us/step - accuracy: 0.5910 - loss: nan
Epoch 5/200
[1m2388/2388[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 551us/step - accuracy: 0.5945 - loss: nan
Epoch 6/200
[1m2388/2388[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 533us/step - accuracy: 0.5923 - loss: nan
Epoch 7/200
[1m2388/2388[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 540us/step - accuracy: 0.5928 - loss: nan
Epoch 8/200
[1m2388/2388[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 529us/step - accuracy: 0.5925 - loss: nan
Epoch 9/200
[1m2388/238

In [16]:
# Evaluate the model using the test data
model_loss, model_accuracy = best_model.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

796/796 - 0s - 394us/step - accuracy: 0.5974 - loss: nan
Loss: nan, Accuracy: 0.5973607897758484


In [17]:
# Reshape y if it's already a NumPy array
y = y.ravel()

# Using Random Forest
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a model
clf = RandomForestClassifier()
clf.fit(X_train, y_train)

# Predict and evaluate
y_pred = clf.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))

Accuracy: 0.9682866961217477
