In [112]:
!git clone https://github.com/anissapatel/ML4VA.git

fatal: destination path 'ML4VA' already exists and is not an empty directory.


In [113]:
!pip install geopandas



In [114]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, cross_val_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
from scipy.stats import randint
import requests
import geopandas as gpd
import numpy as np
np.random.seed(42)

In [115]:
# Define the API URL
url = "https://services.arcgis.com/p5v98VHDX9Atv3l7/arcgis/rest/services/CrashData_test/FeatureServer/0/query?where=1%3D1&outFields=*&outSR=4326&f=json"
response = requests.get(url)

In [116]:
# Check if the request was successful
if response.status_code == 200:
    print("Data fetched successfully!")
    # Load the GeoJSON data into a GeoDataFrame
    geo_data = gpd.read_file(response.text)
    # Convert to DataFrame if you don’t need geometry
    df = pd.DataFrame(geo_data.drop(columns='geometry'))
    print("Data successfully converted to DataFrame!")
else:
    print(f"Failed to fetch data. Status code: {response.status_code}")

df.head()

Data fetched successfully!
Data successfully converted to DataFrame!


Unnamed: 0,OBJECTID,DOCUMENT_NBR,CRASH_YEAR,CRASH_DT,CRASH_MILITARY_TM,CRASH_SEVERITY,K_PEOPLE,A_PEOPLE,B_PEOPLE,C_PEOPLE,...,AREA_TYPE,SYSTEM,VSP,OWNERSHIP,PLAN_DISTRICT,MPO_NAME,RTE_NM,RNS_MP,NODE,OFFSET
0,1,163465085,2016,2016-12-06 05:00:00+00:00,700,O,0,0,0,0,...,Urban,VDOT Interstate,7,1. State Hwy Agency,Northern Virginia,NOVA,R-VA IS00095NB,158.85,,
1,2,160725125,2016,2016-01-26 05:00:00+00:00,1636,O,0,0,0,0,...,Urban,NonVDOT secondary,5,3. City or Town Hwy Agency,Hampton Roads,HAMP,S-VA114NP WOODLAND RD,0.5,253154.0,318.32
2,3,160465260,2016,2016-02-13 05:00:00+00:00,1845,B,0,0,2,0,...,Rural,VDOT Secondary,1,1. State Hwy Agency,Northern Neck,,R-VA066SC00640NB,2.54,1149141.0,5.82
3,4,161255113,2016,2016-03-26 04:00:00+00:00,1,O,0,0,0,0,...,Urban,VDOT Secondary,1,1. State Hwy Agency,Richmond Regional,RICH,R-VA042SC00782SB,1.19,,
4,5,162005180,2016,2016-07-18 04:00:00+00:00,732,O,0,0,0,0,...,Rural,VDOT Primary,6,1. State Hwy Agency,"Roanoke Valley-Alleghany, West Piedmont",,R-VA US00220NB,48.61,328737.0,5.34


In [117]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 67 columns):
 #   Column                    Non-Null Count  Dtype              
---  ------                    --------------  -----              
 0   OBJECTID                  2000 non-null   int32              
 1   DOCUMENT_NBR              2000 non-null   int32              
 2   CRASH_YEAR                2000 non-null   object             
 3   CRASH_DT                  2000 non-null   datetime64[ms, UTC]
 4   CRASH_MILITARY_TM         2000 non-null   object             
 5   CRASH_SEVERITY            2000 non-null   object             
 6   K_PEOPLE                  2000 non-null   int16              
 7   A_PEOPLE                  2000 non-null   int16              
 8   B_PEOPLE                  2000 non-null   int16              
 9   C_PEOPLE                  2000 non-null   int16              
 10  PERSONS_INJURED           2000 non-null   int16              
 11  PEDESTRIANS_KILLE

In [118]:
df = df[["CRASH_YEAR", "CRASH_DT", "CRASH_SEVERITY","COLLISION_TYPE", "WEATHER_CONDITION",
         "LIGHT_CONDITION","ROADWAY_SURFACE_COND", "RELATION_TO_ROADWAY","ROADWAY_ALIGNMENT",
         "ROADWAY_DEFECT", "ROADWAY_DESCRIPTION","INTERSECTION_TYPE","SCHOOL_ZONE",
         "SPEED_NOTSPEED","INTERSECTION_ANALYSIS", "MAINLINE_YN","NIGHT","VDOT_DISTRICT",
         "AREA_TYPE","RTE_NM"]]
df.head()

Unnamed: 0,CRASH_YEAR,CRASH_DT,CRASH_SEVERITY,COLLISION_TYPE,WEATHER_CONDITION,LIGHT_CONDITION,ROADWAY_SURFACE_COND,RELATION_TO_ROADWAY,ROADWAY_ALIGNMENT,ROADWAY_DEFECT,ROADWAY_DESCRIPTION,INTERSECTION_TYPE,SCHOOL_ZONE,SPEED_NOTSPEED,INTERSECTION_ANALYSIS,MAINLINE_YN,NIGHT,VDOT_DISTRICT,AREA_TYPE,RTE_NM
0,2016,2016-12-06 05:00:00+00:00,O,1. Rear End,1. No Adverse Condition (Clear/Cloudy),4. Darkness - Road Lighted,1. Dry,8. Non-Intersection,1. Straight - Level,1. No Defects,"3. Two-Way, Divided, Positive Median Barrier",1. Not at Intersection,3. No,Yes,Not Intersection,Yes,Yes,9. Northern Virginia,Urban,R-VA IS00095NB
1,2016,2016-01-26 05:00:00+00:00,O,2. Angle,1. No Adverse Condition (Clear/Cloudy),2. Daylight,1. Dry,1. Main-Line Roadway,1. Straight - Level,1. No Defects,"1. Two-Way, Not Divided",1. Not at Intersection,3. No,No,Not Intersection,Yes,No,5. Hampton Roads,Urban,S-VA114NP WOODLAND RD
2,2016,2016-02-13 05:00:00+00:00,B,9. Fixed Object - Off Road,1. No Adverse Condition (Clear/Cloudy),4. Darkness - Road Lighted,1. Dry,8. Non-Intersection,1. Straight - Level,1. No Defects,"1. Two-Way, Not Divided",1. Not at Intersection,3. No,No,VDOT Intersection,Yes,Yes,6. Fredericksburg,Rural,R-VA066SC00640NB
3,2016,2016-03-26 04:00:00+00:00,O,1. Rear End,5. Rain,5. Darkness - Road Not Lighted,2. Wet,8. Non-Intersection,1. Straight - Level,1. No Defects,"1. Two-Way, Not Divided",1. Not at Intersection,3. No,No,Not Intersection,Yes,Yes,4. Richmond,Urban,R-VA042SC00782SB
4,2016,2016-07-18 04:00:00+00:00,O,1. Rear End,1. No Adverse Condition (Clear/Cloudy),2. Daylight,1. Dry,9. Within Intersection,3. Grade - Straight,1. No Defects,"2. Two-Way, Divided, Unprotected Median",3. Three Approaches,3. No,No,VDOT Intersection,Yes,No,2. Salem,Rural,R-VA US00220NB


In [119]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 20 columns):
 #   Column                 Non-Null Count  Dtype              
---  ------                 --------------  -----              
 0   CRASH_YEAR             2000 non-null   object             
 1   CRASH_DT               2000 non-null   datetime64[ms, UTC]
 2   CRASH_SEVERITY         2000 non-null   object             
 3   COLLISION_TYPE         2000 non-null   object             
 4   WEATHER_CONDITION      2000 non-null   object             
 5   LIGHT_CONDITION        2000 non-null   object             
 6   ROADWAY_SURFACE_COND   2000 non-null   object             
 7   RELATION_TO_ROADWAY    2000 non-null   object             
 8   ROADWAY_ALIGNMENT      2000 non-null   object             
 9   ROADWAY_DEFECT         2000 non-null   object             
 10  ROADWAY_DESCRIPTION    2000 non-null   object             
 11  INTERSECTION_TYPE      2000 non-null   object           

In [120]:
# Filter and preprocess dataset
df['CRASH_SEVERITY'] = df['CRASH_SEVERITY'].map({'O': 0, 'B': 1, 'C': 2, 'A': 3, 'K': 4})
df.dropna(subset=['CRASH_SEVERITY'], inplace=True)

# Feature engineering
df['CRASH_DT'] = pd.to_datetime(df['CRASH_DT'])
df['hour'] = df['CRASH_DT'].dt.hour
df['day_of_week'] = df['CRASH_DT'].dt.dayofweek
df['month'] = df['CRASH_DT'].dt.month

In [121]:
# Step 3: Define target and feature columns
target_column = "CRASH_SEVERITY"
# categorical_columns = ["COLLISION_TYPE", "WEATHER_CONDITION", "LIGHT_CONDITION",
#                        "ROADWAY_SURFACE_COND", "RELATION_TO_ROADWAY", "ROADWAY_ALIGNMENT",
#                        "ROADWAY_DEFECT", "ROADWAY_DESCRIPTION", "INTERSECTION_TYPE",
#                        "SCHOOL_ZONE", "SPEED_NOTSPEED", "INTERSECTION_ANALYSIS",
#                        "MAINLINE_YN", "NIGHT", "VDOT_DISTRICT", "AREA_TYPE", "RTE_NM"]
categorical_columns = [
    "COLLISION_TYPE", "WEATHER_CONDITION", "LIGHT_CONDITION", "ROADWAY_SURFACE_COND",
    "RELATION_TO_ROADWAY", "ROADWAY_ALIGNMENT", "VDOT_DISTRICT"
]
numerical_columns = ["hour", "day_of_week", "month"]

In [122]:
# Split data into features (X) and target (y)
X = df[numerical_columns + categorical_columns]
y = df[target_column]

In [123]:
# Encode the target variable
# label_encoder = LabelEncoder()
# y_encoded = label_encoder.fit_transform(y)

In [124]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [125]:
categorical_preprocessor = Pipeline([
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])
preprocessor = ColumnTransformer([
    ('cat', categorical_preprocessor, categorical_columns)
], remainder='passthrough')

In [126]:
# Define the pipeline with preprocessing and model
rf_model = RandomForestRegressor(random_state=42)
rf_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', rf_model)
])

In [127]:
# Step 5: Initial Model Evaluation using Cross-Validation
rf_pipeline.fit(X_train, y_train)
rf_preds = rf_pipeline.predict(X_test)
rf_rmse = np.sqrt(mean_squared_error(y_test, rf_preds))
print(f"Random Forest RMSE: {rf_rmse}")

Random Forest RMSE: 1.01618600258508


In [128]:
param_grid = {
    'model__n_estimators': [50, 100, 150],
    'model__max_depth': [10, 20, None],
    'model__min_samples_split': [2, 5, 10],
    'model__min_samples_leaf': [1, 2, 4],
    'model__max_features': ['auto', 'sqrt', 'log2']
}

In [129]:
# rf_pipeline = Pipeline([
#     ('preprocessor', preprocessor),
#     ('model', RandomForestRegressor(random_state=42))
# ])

In [130]:
grid_search_rf = GridSearchCV(
    rf_pipeline,
    param_grid,
    cv=3,
    scoring='neg_mean_squared_error',
    verbose=3,
    n_jobs=-1
)

In [131]:
# Fit GridSearchCV
grid_search_rf.fit(X_train, y_train)

Fitting 3 folds for each of 243 candidates, totalling 729 fits


In [132]:
best_rf_model = grid_search_rf.best_estimator_
rf_preds_tuned = best_rf_model.predict(X_test)
rf_rmse_tuned = np.sqrt(mean_squared_error(y_test, rf_preds_tuned))

print(f"Best Parameters (Random Forest): {grid_search_rf.best_params_}")
print(f"Random Forest RMSE after tuning: {rf_rmse_tuned}")

Best Parameters (Random Forest): {'model__max_depth': 10, 'model__max_features': 'sqrt', 'model__min_samples_leaf': 4, 'model__min_samples_split': 10, 'model__n_estimators': 100}
Random Forest RMSE after tuning: 0.9371556778555059


In [133]:
# View GridSearchCV results in a DataFrame
results_df = pd.DataFrame(grid_search_rf.cv_results_)
results_df[['params', 'mean_test_score', 'std_test_score']].sort_values(by="mean_test_score", ascending=False)

Unnamed: 0,params,mean_test_score,std_test_score
52,"{'model__max_depth': 10, 'model__max_features'...",-0.824938,0.013486
53,"{'model__max_depth': 10, 'model__max_features'...",-0.825628,0.013706
160,"{'model__max_depth': 20, 'model__max_features'...",-0.825976,0.015367
51,"{'model__max_depth': 10, 'model__max_features'...",-0.826133,0.014662
241,"{'model__max_depth': None, 'model__max_feature...",-0.826515,0.016064
...,...,...,...
184,"{'model__max_depth': None, 'model__max_feature...",,
185,"{'model__max_depth': None, 'model__max_feature...",,
186,"{'model__max_depth': None, 'model__max_feature...",,
187,"{'model__max_depth': None, 'model__max_feature...",,


In [134]:
param_distributions = {
    'model__n_estimators': randint(50, 200),
    'model__max_depth': [10, 20, None],
    'model__min_samples_split': randint(2, 15),
    'model__min_samples_leaf': randint(1, 10),
    'model__max_features': ['auto', 'sqrt', 'log2'],
    'model__bootstrap': [True, False]
}

In [135]:
rf_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', RandomForestRegressor(random_state=42))
])

In [136]:
random_search_rf = RandomizedSearchCV(
    rf_pipeline,
    param_distributions=param_distributions,
    n_iter=50,
    cv=5,
    scoring='neg_mean_squared_error',
    verbose=3,
    n_jobs=-1,
    random_state=42
)

In [137]:
# Fit RandomizedSearchCV
random_search_rf.fit(X_train, y_train)

Fitting 5 folds for each of 50 candidates, totalling 250 fits


In [138]:
# Display the best hyperparameters from RandomizedSearchCV
print("Best Hyperparameters (RandomizedSearchCV):", random_search_rf.best_params_)

Best Hyperparameters (RandomizedSearchCV): {'model__bootstrap': False, 'model__max_depth': 10, 'model__max_features': 'log2', 'model__min_samples_leaf': 5, 'model__min_samples_split': 4, 'model__n_estimators': 184}


In [139]:
best_rf_model_random = random_search_rf.best_estimator_

In [140]:
# Evaluate on test data with the best model from RandomizedSearchCV
rf_random_preds = best_rf_model_random.predict(X_test)
rf_random_rmse = np.sqrt(mean_squared_error(y_test, rf_random_preds))

print(f"Best Parameters (RandomizedSearchCV - Random Forest): {random_search_rf.best_params_}")
print(f"Random Forest RMSE after RandomizedSearchCV tuning: {rf_random_rmse}")

Best Parameters (RandomizedSearchCV - Random Forest): {'model__bootstrap': False, 'model__max_depth': 10, 'model__max_features': 'log2', 'model__min_samples_leaf': 5, 'model__min_samples_split': 4, 'model__n_estimators': 184}
Random Forest RMSE after RandomizedSearchCV tuning: 0.9343544038931064


In [141]:
# Display each combination's performance from RandomizedSearchCV
cvres = random_search_rf.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(np.sqrt(-mean_score), params)

0.912774002869493 {'model__bootstrap': True, 'model__max_depth': 10, 'model__max_features': 'log2', 'model__min_samples_leaf': 8, 'model__min_samples_split': 14, 'model__n_estimators': 70}
0.9114624299979243 {'model__bootstrap': True, 'model__max_depth': 20, 'model__max_features': 'log2', 'model__min_samples_leaf': 7, 'model__min_samples_split': 12, 'model__n_estimators': 137}
0.9108156549460498 {'model__bootstrap': True, 'model__max_depth': None, 'model__max_features': 'sqrt', 'model__min_samples_leaf': 5, 'model__min_samples_split': 3, 'model__n_estimators': 137}
0.9178144082018241 {'model__bootstrap': False, 'model__max_depth': 20, 'model__max_features': 'sqrt', 'model__min_samples_leaf': 2, 'model__min_samples_split': 13, 'model__n_estimators': 70}
0.9117107695763469 {'model__bootstrap': True, 'model__max_depth': 20, 'model__max_features': 'sqrt', 'model__min_samples_leaf': 9, 'model__min_samples_split': 2, 'model__n_estimators': 108}
0.9091844727428847 {'model__bootstrap': True, '

In [142]:
# Extract and display feature importances from the best model
if hasattr(best_rf_model_random.named_steps['model'], 'feature_importances_'):
    feature_importances = best_rf_model_random.named_steps['model'].feature_importances_
    encoded_categorical_columns = best_rf_model_random.named_steps['preprocessor'].named_transformers_['cat'].get_feature_names_out(categorical_columns)
    all_feature_names = np.concatenate([encoded_categorical_columns, numerical_columns])
    min_length = min(len(feature_importances), len(all_feature_names))
    aligned_feature_importances = feature_importances[:min_length]
    aligned_feature_names = all_feature_names[:min_length]
    importance_df = pd.DataFrame({"Feature": aligned_feature_names, "Importance": aligned_feature_importances}).sort_values(by="Importance", ascending=False)
    print("Top 10 Features by Importance:")
    print(importance_df.head(10))

Top 10 Features by Importance:
                                       Feature  Importance
70                                       month    0.069607
63              VDOT_DISTRICT_5. Hampton Roads    0.066936
1                      COLLISION_TYPE_10. Deer    0.064288
3                       COLLISION_TYPE_12. Ped    0.063026
69                                 day_of_week    0.056184
47     RELATION_TO_ROADWAY_8. Non-Intersection    0.043458
48  RELATION_TO_ROADWAY_9. Within Intersection    0.041996
30                 ROADWAY_SURFACE_COND_2. Wet    0.041815
12   COLLISION_TYPE_9. Fixed Object - Off Road    0.041422
51          ROADWAY_ALIGNMENT_2. Curve - Level    0.041045


# Summary of results:
**Intial cross-validation RMSE Scores (before tuning)**

[1.592, 1.474, 1.492, 1.566, 1.389, 1.498, 1.534, 1.520, 1.512, 1.351]

*  Mean RMSE: 1.493

---



**Best Hyperparamters form GridSearchCV**

{'regressor__bootstrap': False, 'regressor__max_depth': None, 'regressor__max_features': 3, 'regressor__min_samples_split': 10, 'regressor__n_estimators': 150}

---



**Final test RMSE and MAE from GridSearchCV**

* Final Test RMSE: 1.396
* Final Test MAE: 1.260


---

**Best Hyperparameters from RandomizedSearchCV**

{'regressor__bootstrap': False, 'regressor__max_depth': None, 'regressor__max_features': 6, 'regressor__min_samples_split': 10, 'regressor__n_estimators': 57}


---

**Final test RMSE and MAE from RandomizedSearchCV**

*   Final Test RMSE: 1.405
*   Final Test MAE: 1.259


---

 **Feature Importances (Top 10 Features)**

1. Importance: 0.013178 | Feature: COLLISION_TYPE_12. Ped
2. Importance: 0.012126 | Feature: COLLISION_TYPE_9. Fixed Object - Off Road
3. Importance: 0.011911 | Feature: ROADWAY_SURFACE_COND_2. Wet
4. Importance: 0.011283 | Feature: ROADWAY_DESCRIPTION_3. Two-Way, Divided, Positive
5. Importance: 0.011151 | Feature: ROADWAY_DESCRIPTION_1. Two-Way, Not Divided
6. Importance: 0.010915 | Feature: COLLISION_TYPE_1. Rear End
7. Importance: 0.010695 | Feature: VDOT_DISTRICT_5. Hampton Roads
8. Importance: 0.010624 | Feature: COLLISION_TYPE_10. Deer
9. Importance: 0.010613 | Feature: LIGHT_CONDITION_2. Daylight
10. Importance: 0.010547 | Feature: ROADWAY_ALIGNMENT_1. Straight - Level






In [143]:
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

# Step 1: Define the Gradient Boosting model and pipeline
gb_model = HistGradientBoostingRegressor(random_state=42)
gb_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', gb_model)
])

In [144]:
# Step 2: GridSearchCV for fine-tuning
gb_param_grid = {
    'model__max_depth': [None, 3, 5, 10],
    'model__learning_rate': [0.01, 0.05, 0.1],
    'model__max_iter': [100, 200, 300],
    'model__l2_regularization': [0.0, 0.1, 1.0],
}

In [145]:
grid_search = GridSearchCV(
    gb_pipeline,
    param_grid=gb_param_grid,
    cv=3,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    verbose=1
)

In [146]:
grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 108 candidates, totalling 324 fits


In [147]:
# Best model and performance from GridSearchCV
best_gb_model = grid_search.best_estimator_
print("Best Parameters (GridSearchCV):", grid_search.best_params_)

Best Parameters (GridSearchCV): {'model__l2_regularization': 0.1, 'model__learning_rate': 0.01, 'model__max_depth': 3, 'model__max_iter': 200}


In [148]:
# Evaluate on the test set
gb_preds_grid = best_gb_model.predict(X_test)
gb_rmse_grid = np.sqrt(mean_squared_error(y_test, gb_preds_grid))
print(f"Gradient Boosting RMSE (GridSearchCV): {gb_rmse_grid}")

Gradient Boosting RMSE (GridSearchCV): 0.9293361784500187


In [149]:
# Step 3: RandomizedSearchCV for further fine-tuning
gb_param_dist = {
    'model__max_depth': [None, 3, 5, 10],
    'model__learning_rate': [0.01, 0.05, 0.1, 0.2],
    'model__max_iter': [100, 200, 300, 400],
    'model__l2_regularization': [0.0, 0.1, 1.0, 10.0],
    'model__min_samples_leaf': [10, 20, 50],
}

In [150]:
random_search = RandomizedSearchCV(
    gb_pipeline,
    param_distributions=gb_param_dist,
    n_iter=50,
    cv=3,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    random_state=42,
    verbose=1
)

In [151]:
# Fit the RandomizedSearchCV
random_search.fit(X_train, y_train)

Fitting 3 folds for each of 50 candidates, totalling 150 fits


In [152]:
# Best model and performance from RandomizedSearchCV
best_gb_model_random = random_search.best_estimator_
print("Best Parameters (RandomizedSearchCV):", random_search.best_params_)

Best Parameters (RandomizedSearchCV): {'model__min_samples_leaf': 10, 'model__max_iter': 400, 'model__max_depth': 3, 'model__learning_rate': 0.01, 'model__l2_regularization': 0.1}


In [153]:
# Evaluate on the test set
gb_preds_random = best_gb_model_random.predict(X_test)
gb_rmse_random = np.sqrt(mean_squared_error(y_test, gb_preds_random))
print(f"Gradient Boosting RMSE (RandomizedSearchCV): {gb_rmse_random}")

Gradient Boosting RMSE (RandomizedSearchCV): 0.9362029718660315


In [154]:
# Scale data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_processed)
X_test_scaled = scaler.transform(X_test_processed)

# Reshape for LSTM input
X_train_lstm = X_train_scaled.reshape((X_train_scaled.shape[0], 1, X_train_scaled.shape[1]))
X_test_lstm = X_test_scaled.reshape((X_test_scaled.shape[0], 1, X_test_scaled.shape[1]))

In [155]:
# Step 2: Define the model-building function
def build_lstm_model(hp):
    model = Sequential()
    model.add(LSTM(units=hp.Int('units_1', min_value=32, max_value=128, step=32),
                   activation=hp.Choice('activation_1', values=['relu', 'tanh']),
                   input_shape=(X_train_lstm.shape[1], X_train_lstm.shape[2]),
                   return_sequences=True))
    model.add(Dropout(rate=hp.Float('dropout_1', min_value=0.1, max_value=0.5, step=0.1)))
    model.add(LSTM(units=hp.Int('units_2', min_value=16, max_value=64, step=16),
                   activation=hp.Choice('activation_2', values=['relu', 'tanh'])))
    model.add(Dropout(rate=hp.Float('dropout_2', min_value=0.1, max_value=0.5, step=0.1)))
    model.add(Dense(1, activation='linear'))
    model.compile(optimizer=hp.Choice('optimizer', values=['adam', 'rmsprop']),
                  loss='mse',
                  metrics=['mae'])
    return model

In [156]:
# Step 3: Set up the hyperparameter tuner
tuner = RandomSearch(
    build_lstm_model,
    objective='val_loss',
    max_trials=10,
    executions_per_trial=1,
    directory='lstm_tuning',
    project_name='traffic_severity'
)

Reloading Tuner from lstm_tuning/traffic_severity/tuner0.json


In [157]:
# Step 4: Define early stopping and run the search
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
tuner.search(X_train_lstm, y_train,
             validation_split=0.2,
             epochs=50,
             batch_size=32,
             callbacks=[early_stopping],
             verbose=1)

# Step 5: Get the best model and evaluate
best_hyperparameters = tuner.get_best_hyperparameters(num_trials=1)[0]
best_model = tuner.get_best_models(num_models=1)[0]

In [158]:
# Evaluate on test set
lstm_preds = best_model.predict(X_test_lstm)
lstm_rmse = np.sqrt(mean_squared_error(y_test, lstm_preds))
print(f"Best Hyperparameters: {best_hyperparameters.values}")
print(f"LSTM RMSE: {lstm_rmse}")

[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 42ms/step
Best Hyperparameters: {'units_1': 96, 'activation_1': 'relu', 'dropout_1': 0.30000000000000004, 'units_2': 48, 'activation_2': 'relu', 'dropout_2': 0.4, 'optimizer': 'rmsprop'}
LSTM RMSE: 0.9378803155432196
