<a href="https://colab.research.google.com/github/anissapatel/ML4VA/blob/main/ML4VA_Traffic_Patterns.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!git clone https://github.com/anissapatel/ML4VA.git

fatal: destination path 'ML4VA' already exists and is not an empty directory.


In [2]:
!pip install geopandas



In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, cross_val_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
from scipy.stats import randint
import requests
import geopandas as gpd
import numpy as np
np.random.seed(42)

In [4]:
# Define the API URL
url = "https://services.arcgis.com/p5v98VHDX9Atv3l7/arcgis/rest/services/CrashData_test/FeatureServer/0/query?where=1%3D1&outFields=*&outSR=4326&f=json"
response = requests.get(url)

In [5]:
# Check if the request was successful
if response.status_code == 200:
    print("Data fetched successfully!")
    # Load the GeoJSON data into a GeoDataFrame
    geo_data = gpd.read_file(response.text)
    # Convert to DataFrame if you don’t need geometry
    df = pd.DataFrame(geo_data.drop(columns='geometry'))
    print("Data successfully converted to DataFrame!")
else:
    print(f"Failed to fetch data. Status code: {response.status_code}")

df.head()

Data fetched successfully!
Data successfully converted to DataFrame!


Unnamed: 0,OBJECTID,DOCUMENT_NBR,CRASH_YEAR,CRASH_DT,CRASH_MILITARY_TM,CRASH_SEVERITY,K_PEOPLE,A_PEOPLE,B_PEOPLE,C_PEOPLE,...,AREA_TYPE,SYSTEM,VSP,OWNERSHIP,PLAN_DISTRICT,MPO_NAME,RTE_NM,RNS_MP,NODE,OFFSET
0,1,163465085,2016,2016-12-06 05:00:00+00:00,700,O,0,0,0,0,...,Urban,VDOT Interstate,7,1. State Hwy Agency,Northern Virginia,NOVA,R-VA IS00095NB,158.85,,
1,2,160725125,2016,2016-01-26 05:00:00+00:00,1636,O,0,0,0,0,...,Urban,NonVDOT secondary,5,3. City or Town Hwy Agency,Hampton Roads,HAMP,S-VA114NP WOODLAND RD,0.5,253154.0,318.32
2,3,160465260,2016,2016-02-13 05:00:00+00:00,1845,B,0,0,2,0,...,Rural,VDOT Secondary,1,1. State Hwy Agency,Northern Neck,,R-VA066SC00640NB,2.54,1149141.0,5.82
3,4,161255113,2016,2016-03-26 04:00:00+00:00,1,O,0,0,0,0,...,Urban,VDOT Secondary,1,1. State Hwy Agency,Richmond Regional,RICH,R-VA042SC00782SB,1.19,,
4,5,162005180,2016,2016-07-18 04:00:00+00:00,732,O,0,0,0,0,...,Rural,VDOT Primary,6,1. State Hwy Agency,"Roanoke Valley-Alleghany, West Piedmont",,R-VA US00220NB,48.61,328737.0,5.34


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 67 columns):
 #   Column                    Non-Null Count  Dtype              
---  ------                    --------------  -----              
 0   OBJECTID                  2000 non-null   int32              
 1   DOCUMENT_NBR              2000 non-null   int32              
 2   CRASH_YEAR                2000 non-null   object             
 3   CRASH_DT                  2000 non-null   datetime64[ms, UTC]
 4   CRASH_MILITARY_TM         2000 non-null   object             
 5   CRASH_SEVERITY            2000 non-null   object             
 6   K_PEOPLE                  2000 non-null   int16              
 7   A_PEOPLE                  2000 non-null   int16              
 8   B_PEOPLE                  2000 non-null   int16              
 9   C_PEOPLE                  2000 non-null   int16              
 10  PERSONS_INJURED           2000 non-null   int16              
 11  PEDESTRIANS_KILLE

In [7]:
df = df[["CRASH_YEAR", "CRASH_DT", "CRASH_SEVERITY","COLLISION_TYPE", "WEATHER_CONDITION",
         "LIGHT_CONDITION","ROADWAY_SURFACE_COND", "RELATION_TO_ROADWAY","ROADWAY_ALIGNMENT",
         "ROADWAY_DEFECT", "ROADWAY_DESCRIPTION","INTERSECTION_TYPE","SCHOOL_ZONE",
         "SPEED_NOTSPEED","INTERSECTION_ANALYSIS", "MAINLINE_YN","NIGHT","VDOT_DISTRICT",
         "AREA_TYPE","RTE_NM"]]
df.head()

Unnamed: 0,CRASH_YEAR,CRASH_DT,CRASH_SEVERITY,COLLISION_TYPE,WEATHER_CONDITION,LIGHT_CONDITION,ROADWAY_SURFACE_COND,RELATION_TO_ROADWAY,ROADWAY_ALIGNMENT,ROADWAY_DEFECT,ROADWAY_DESCRIPTION,INTERSECTION_TYPE,SCHOOL_ZONE,SPEED_NOTSPEED,INTERSECTION_ANALYSIS,MAINLINE_YN,NIGHT,VDOT_DISTRICT,AREA_TYPE,RTE_NM
0,2016,2016-12-06 05:00:00+00:00,O,1. Rear End,1. No Adverse Condition (Clear/Cloudy),4. Darkness - Road Lighted,1. Dry,8. Non-Intersection,1. Straight - Level,1. No Defects,"3. Two-Way, Divided, Positive Median Barrier",1. Not at Intersection,3. No,Yes,Not Intersection,Yes,Yes,9. Northern Virginia,Urban,R-VA IS00095NB
1,2016,2016-01-26 05:00:00+00:00,O,2. Angle,1. No Adverse Condition (Clear/Cloudy),2. Daylight,1. Dry,1. Main-Line Roadway,1. Straight - Level,1. No Defects,"1. Two-Way, Not Divided",1. Not at Intersection,3. No,No,Not Intersection,Yes,No,5. Hampton Roads,Urban,S-VA114NP WOODLAND RD
2,2016,2016-02-13 05:00:00+00:00,B,9. Fixed Object - Off Road,1. No Adverse Condition (Clear/Cloudy),4. Darkness - Road Lighted,1. Dry,8. Non-Intersection,1. Straight - Level,1. No Defects,"1. Two-Way, Not Divided",1. Not at Intersection,3. No,No,VDOT Intersection,Yes,Yes,6. Fredericksburg,Rural,R-VA066SC00640NB
3,2016,2016-03-26 04:00:00+00:00,O,1. Rear End,5. Rain,5. Darkness - Road Not Lighted,2. Wet,8. Non-Intersection,1. Straight - Level,1. No Defects,"1. Two-Way, Not Divided",1. Not at Intersection,3. No,No,Not Intersection,Yes,Yes,4. Richmond,Urban,R-VA042SC00782SB
4,2016,2016-07-18 04:00:00+00:00,O,1. Rear End,1. No Adverse Condition (Clear/Cloudy),2. Daylight,1. Dry,9. Within Intersection,3. Grade - Straight,1. No Defects,"2. Two-Way, Divided, Unprotected Median",3. Three Approaches,3. No,No,VDOT Intersection,Yes,No,2. Salem,Rural,R-VA US00220NB


In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 20 columns):
 #   Column                 Non-Null Count  Dtype              
---  ------                 --------------  -----              
 0   CRASH_YEAR             2000 non-null   object             
 1   CRASH_DT               2000 non-null   datetime64[ms, UTC]
 2   CRASH_SEVERITY         2000 non-null   object             
 3   COLLISION_TYPE         2000 non-null   object             
 4   WEATHER_CONDITION      2000 non-null   object             
 5   LIGHT_CONDITION        2000 non-null   object             
 6   ROADWAY_SURFACE_COND   2000 non-null   object             
 7   RELATION_TO_ROADWAY    2000 non-null   object             
 8   ROADWAY_ALIGNMENT      2000 non-null   object             
 9   ROADWAY_DEFECT         2000 non-null   object             
 10  ROADWAY_DESCRIPTION    2000 non-null   object             
 11  INTERSECTION_TYPE      2000 non-null   object           

In [9]:
# Step 3: Define target and feature columns
target_column = "CRASH_SEVERITY"
categorical_columns = ["COLLISION_TYPE", "WEATHER_CONDITION", "LIGHT_CONDITION",
                       "ROADWAY_SURFACE_COND", "RELATION_TO_ROADWAY", "ROADWAY_ALIGNMENT",
                       "ROADWAY_DEFECT", "ROADWAY_DESCRIPTION", "INTERSECTION_TYPE",
                       "SCHOOL_ZONE", "SPEED_NOTSPEED", "INTERSECTION_ANALYSIS",
                       "MAINLINE_YN", "NIGHT", "VDOT_DISTRICT", "AREA_TYPE", "RTE_NM"]

In [10]:
# Split data into features (X) and target (y)
X = df[categorical_columns]
y = df[target_column]

In [11]:
# Encode the target variable
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

In [12]:
# Further split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

In [13]:
# Step 4: Define preprocessing and model pipeline
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(sparse_output=False, handle_unknown="ignore"), categorical_columns)
])

In [14]:
# Define the pipeline with preprocessing and model
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(random_state=42))
])

In [15]:
# Step 5: Initial Model Evaluation using Cross-Validation
cross_val_scores = cross_val_score(pipeline, X_train, y_train, scoring="neg_mean_squared_error", cv=10, error_score='raise')
initial_rmse_scores = np.sqrt(-cross_val_scores)
print("Initial Cross-Validation RMSE Scores:", initial_rmse_scores)
print("Mean RMSE:", initial_rmse_scores.mean())

Initial Cross-Validation RMSE Scores: [1.59203487 1.4726939  1.49702261 1.5625711  1.38888185 1.49907511
 1.53438176 1.51618302 1.51162677 1.3523169 ]
Mean RMSE: 1.4926787870736444


In [16]:
# Step 6: Hyperparameter tuning with GridSearchCV
param_grid = {'regressor__n_estimators': [50, 100, 150],
              'regressor__max_features': [3, 5, 7],
              'regressor__max_depth': [10, 15, 20, None],  # Add max depth to control overfitting
              'regressor__min_samples_split': [2, 5, 10],   # Minimum samples required to split
              'regressor__bootstrap': [False]
}

grid_search = GridSearchCV(pipeline,
                           param_grid,
                           cv=5,
                           scoring='neg_mean_squared_error',
                           return_train_score=True
)

In [17]:
# Fit GridSearchCV
grid_search.fit(X_train, y_train)

KeyboardInterrupt: 

In [None]:
# Display the best hyperparameters and best model
print("Best Hyperparameters (GridSearchCV):", grid_search.best_params_)
best_model = grid_search.best_estimator_

In [None]:
# Step 7: Model evaluation on test set
y_test_predictions = best_model.predict(X_test)
final_mse = mean_squared_error(y_test, y_test_predictions)
final_rmse = np.sqrt(final_mse)
print("Final Test RMSE (GridSearchCV):", final_rmse)

# Mean Absolute Error (optional)
final_mae = mean_absolute_error(y_test, y_test_predictions)
print("Final Test MAE (GridSearchCV):", final_mae)

In [None]:
# View GridSearchCV results in a DataFrame
results_df = pd.DataFrame(grid_search.cv_results_)
print(results_df[['params', 'mean_test_score', 'std_test_score']].sort_values(by="mean_test_score", ascending=False))

In [None]:
# Step 8: Further tuning with RandomizedSearchCV
param_distribs = {'regressor__n_estimators': randint(low=50, high=200),
                  'regressor__max_features': randint(low=3, high=8),
                  'regressor__max_depth': [10, 15, 20, None],
                  'regressor__min_samples_split': [2, 5, 10],
                  'regressor__bootstrap': [False]
}

rnd_search = RandomizedSearchCV(pipeline,
                                param_distributions=param_distribs,
                                n_iter=15,  # Try 15 random combinations
                                cv=5,
                                scoring='neg_mean_squared_error',
                                random_state=42
)

In [None]:
# Fit RandomizedSearchCV
rnd_search.fit(X_train, y_train)

In [None]:
# Display the best hyperparameters from RandomizedSearchCV
print("Best Hyperparameters (RandomizedSearchCV):", rnd_search.best_params_)

In [None]:
# Evaluate on test data with the best model from RandomizedSearchCV
y_test_predictions_rnd = rnd_search.best_estimator_.predict(X_test)
final_mse_rnd = mean_squared_error(y_test, y_test_predictions_rnd)
final_rmse_rnd = np.sqrt(final_mse_rnd)
print("Final Test RMSE (RandomizedSearchCV):", final_rmse_rnd)

# Display Mean Absolute Error
final_mae_rnd = mean_absolute_error(y_test, y_test_predictions_rnd)
print("Final Test MAE (RandomizedSearchCV):", final_mae_rnd)

In [None]:
# Display each combination's performance from RandomizedSearchCV
cvres = rnd_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(np.sqrt(-mean_score), params)

In [None]:
# Get feature importances from the best model if available
if hasattr(best_model.named_steps['regressor'], 'feature_importances_'):
    feature_importances = best_model.named_steps['regressor'].feature_importances_
    encoded_columns = best_model.named_steps['preprocessor'].named_transformers_['cat'].get_feature_names_out(categorical_columns)
    importance_df = pd.DataFrame(sorted(zip(feature_importances, encoded_columns), reverse=True), columns=["Importance", "Feature"])
    print(importance_df.head(10))  # Display top 10 most important features

# Summary of results:
**Intial cross-validation RMSE Scores (before tuning)**

[1.592, 1.474, 1.492, 1.566, 1.389, 1.498, 1.534, 1.520, 1.512, 1.351]

*  Mean RMSE: 1.493

---



**Best Hyperparamters form GridSearchCV**

{'regressor__bootstrap': False, 'regressor__max_depth': None, 'regressor__max_features': 3, 'regressor__min_samples_split': 10, 'regressor__n_estimators': 150}

---



**Final test RMSE and MAE from GridSearchCV**

* Final Test RMSE: 1.396
* Final Test MAE: 1.260


---

**Best Hyperparameters from RandomizedSearchCV**

{'regressor__bootstrap': False, 'regressor__max_depth': None, 'regressor__max_features': 6, 'regressor__min_samples_split': 10, 'regressor__n_estimators': 57}


---

**Final test RMSE and MAE from RandomizedSearchCV**

*   Final Test RMSE: 1.405
*   Final Test MAE: 1.259


---

 **Feature Importances (Top 10 Features)**

1. Importance: 0.013178 | Feature: COLLISION_TYPE_12. Ped
2. Importance: 0.012126 | Feature: COLLISION_TYPE_9. Fixed Object - Off Road
3. Importance: 0.011911 | Feature: ROADWAY_SURFACE_COND_2. Wet
4. Importance: 0.011283 | Feature: ROADWAY_DESCRIPTION_3. Two-Way, Divided, Positive
5. Importance: 0.011151 | Feature: ROADWAY_DESCRIPTION_1. Two-Way, Not Divided
6. Importance: 0.010915 | Feature: COLLISION_TYPE_1. Rear End
7. Importance: 0.010695 | Feature: VDOT_DISTRICT_5. Hampton Roads
8. Importance: 0.010624 | Feature: COLLISION_TYPE_10. Deer
9. Importance: 0.010613 | Feature: LIGHT_CONDITION_2. Daylight
10. Importance: 0.010547 | Feature: ROADWAY_ALIGNMENT_1. Straight - Level






In [None]:
import time
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from scipy.stats import randint

# Select important categorical features
important_features = ["COLLISION_TYPE", "WEATHER_CONDITION", "LIGHT_CONDITION", "ROADWAY_SURFACE_COND", "ROADWAY_DESCRIPTION","VDOT_DISTRICT", "ROADWAY_ALIGNMENT"]


# Preprocessor for categorical features
categorical_preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), important_features)
    ]
)

In [None]:
# Create the full pipeline
pipeline = Pipeline([
    ('cat_preprocessor', categorical_preprocessor),
    ('classifier', GradientBoostingClassifier(random_state=42, subsample=0.9))
])

In [None]:
# Parameter distribution for RandomizedSearchCV
param_dist = {
    'classifier__n_estimators': randint(100, 300),
    'classifier__learning_rate': [0.05, 0.1, 0.2],
    'classifier__max_depth': randint(3, 7),
    'classifier__min_samples_split': randint(50, 150),
    'classifier__min_samples_leaf': randint(20, 50)
}

# Setup RandomizedSearchCV
rnd_search = RandomizedSearchCV(
    pipeline,
    param_distributions=param_dist,
    n_iter=100,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    verbose=3,
    random_state=42
)

In [None]:
# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Time the fitting process
start_time = time.time()
rnd_search.fit(X_train, y_train)
elapsed_time = time.time() - start_time

# Display results
print(f"Elapsed time: {elapsed_time:.2f} seconds")
print("Best Parameters:", rnd_search.best_params_)
print("Best Cross-validation Score: {:.2f}".format(rnd_search.best_score_))
print("Test Set Score: {:.2f}".format(rnd_search.score(X_test, y_test)))

In [None]:
# from sklearn.compose import ColumnTransformer
# from sklearn.pipeline import Pipeline
# from sklearn.preprocessing import OneHotEncoder
# from imblearn.over_sampling import SMOTE
# from sklearn.ensemble import GradientBoostingClassifier
# from sklearn.metrics import accuracy_score, classification_report
# from sklearn.model_selection import GridSearchCV, train_test_split

# # Step 1: Define the preprocessing step
# preprocessor = ColumnTransformer(
#     transformers=[
#         ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_columns)
#     ]
# )

# # Step 2: Preprocess X_train before SMOTE
# X_train_encoded = preprocessor.fit_transform(X_train)


In [None]:
# # Oversample with SMOTE
# sm = SMOTE(random_state=42)
# X_resampled, y_resampled = sm.fit_resample(X_train_encoded, y_train)

In [None]:
# # Define Gradient Boosting pipeline
# gb_pipeline = Pipeline([
#     ('classifier', GradientBoostingClassifier(random_state=42))
# ])

In [None]:
# # Hyperparameter grid for Gradient Boosting
# gb_param_grid = {
#     'classifier__n_estimators': [100, 500],
#     'classifier__learning_rate': [0.01, 0.1],
#     'classifier__max_depth': [3, 5],
#     'classifier__subsample': [0.8, 1.0],
#     'classifier__min_samples_split': [2, 10],
#     'classifier__min_samples_leaf': [1, 5]
# }

# # Perform grid search
# gb_random_search = RandomizedSearchCV(
#     gb_pipeline,
#     gb_param_grid,
#     n_iter=30,
#     cv=3,
#     scoring='accuracy',
#     n_jobs=-1,
#     verbose=3,
#     random_state=42
# )

In [None]:
# # Train Gradient Boosting with progress logging
# gb_random_search.fit(X_resampled, y_resampled)

In [None]:
# # Display the best hyperparameters from RandomizedSearchCV
# print("Best Parameters (Gradient Boosting):", gb_random_search.best_params_)

In [None]:
# # Predictions and evaluation for Gradient Boosting
# X_test_encoded = preprocessor.transform(X_test)
# best_model = gb_random_search.best_estimator_
# y_pred = best_model.predict(X_test_encoded)
# print("Accuracy:", accuracy_score(y_test, y_pred))
# print(classification_report(y_test, y_pred))

In [None]:
print(np.unique(y))  # Check unique labels in your target variable
print(X_seq.shape)  # Ensure it's not too sparse or imbalanced
print(np.bincount(y_seq))
y_train = np.array(y_train)  # Ensure this is a flat array, not 2D
y_test = np.array(y_test)    # Ensure this is a flat array as well

In [18]:
import tensorflow as tf
print(tf.__version__)


2.18.0


In [19]:
!pip install --upgrade tensorflow



In [21]:
print(tf.__version__)

2.18.0


In [28]:
!pip install scikeras

Collecting scikeras
  Downloading scikeras-0.13.0-py3-none-any.whl.metadata (3.1 kB)
Downloading scikeras-0.13.0-py3-none-any.whl (26 kB)
Installing collected packages: scikeras
Successfully installed scikeras-0.13.0


In [48]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import regularizers
from scikeras.wrappers import KerasClassifier
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
import numpy as np
from scipy.stats import randint
import time

# Preprocess features
important_features = [
    "COLLISION_TYPE", "WEATHER_CONDITION", "LIGHT_CONDITION", "ROADWAY_SURFACE_COND",
    "ROADWAY_DESCRIPTION", "VDOT_DISTRICT", "ROADWAY_ALIGNMENT"
]

# Preprocessor for categorical features
categorical_preprocessor = ColumnTransformer(
    transformers=[('cat', OneHotEncoder(handle_unknown='ignore'), important_features)]
)

# Function to create the LSTM model
def create_lstm_model(input_shape, lstm_units=256, dropout_rate=0.2, learning_rate=0.001):
    model = Sequential([
        LSTM(lstm_units, input_shape=input_shape, return_sequences=True, kernel_regularizer=regularizers.l2(0.001)),
        Dropout(dropout_rate),
        LSTM(lstm_units),
        Dropout(dropout_rate),
        Dense(256, activation='relu'),
        Dense(len(np.unique(y)), activation='softmax')  # Assumes multi-class classification
    ])

    # Use Adam optimizer with specified learning rate
    optimizer = Adam(learning_rate=learning_rate)
    model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy', metrics=['accuracy'])

    return model

# Example: Split your data into training and test sets (replace X and y with your actual data)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Convert X_train and X_test to NumPy arrays if they are not already
X_train = X_train.values
X_test = X_test.values

# Reshape data to match LSTM input requirements: (samples, timesteps, features)
X_train = X_train.reshape((X_train.shape[0], 1, X_train.shape[1]))  # Assuming 1 timestep per sample
X_test = X_test.reshape((X_test.shape[0], 1, X_test.shape[1]))  # Assuming 1 timestep per sample

# Create a KerasClassifier wrapper
lstm_model = KerasClassifier(build_fn=create_lstm_model, input_shape=(X_train.shape[1], X_train.shape[2]),
                             epochs=10, batch_size=32, verbose=0)

# Create the full pipeline
pipeline = Pipeline([
    ('cat_preprocessor', categorical_preprocessor),  # Preprocessing step for categorical features
    ('lstm_model', lstm_model)  # The LSTM model step
])

# Hyperparameter grid for RandomizedSearchCV (we are fine-tuning the LSTM units, dropout rate, and learning rate)
param_dist = {
    'lstm_model__lstm_units': randint(128, 512),  # Randomly search between 128 and 512 LSTM units
    'lstm_model__dropout_rate': [0.2, 0.3, 0.4],  # Dropout rates to test
    'lstm_model__learning_rate': [0.001, 0.01, 0.1]  # Learning rates to test
}

# Setup RandomizedSearchCV
rnd_search = RandomizedSearchCV(
    pipeline,
    param_distributions=param_dist,
    n_iter=10,  # Use more iterations for better search, e.g., 50 or 100
    cv=3,  # 3-fold cross-validation
    scoring='accuracy',
    n_jobs=-1,
    verbose=3,
    random_state=42
)

# Time the fitting process
start_time = time.time()
rnd_search.fit(X_train, y_train)
elapsed_time = time.time() - start_time

# Display results
print(f"Elapsed time: {elapsed_time:.2f} seconds")
print("Best Parameters:", rnd_search.best_params_)
print("Best Cross-validation Score: {:.2f}".format(rnd_search.best_score_))
print("Test Set Score: {:.2f}".format(rnd_search.score(X_test, y_test)))


AttributeError: 'numpy.ndarray' object has no attribute 'values'

In [30]:
# import tensorflow as tf
# from tensorflow.keras.models import Sequential
# from tensorflow.keras.layers import LSTM, Dense, Dropout
# from tensorflow.keras.optimizers import Adam
# from tensorflow.keras import regularizers
# from tensorflow.keras.regularizers import l2
# from scikeras.wrappers import KerasClassifier, KerasRegressor
# from sklearn.model_selection import train_test_split, RandomizedSearchCV
# from sklearn.pipeline import Pipeline
# from sklearn.compose import ColumnTransformer
# from sklearn.preprocessing import OneHotEncoder
# from sklearn.metrics import accuracy_score
# import numpy as np
# from scipy.stats import randint

# # Preprocess features
# important_features = [
#     "COLLISION_TYPE", "WEATHER_CONDITION", "LIGHT_CONDITION", "ROADWAY_SURFACE_COND",
#     "ROADWAY_DESCRIPTION", "VDOT_DISTRICT", "ROADWAY_ALIGNMENT"
# ]

# # Preprocessor for categorical features
# categorical_preprocessor = ColumnTransformer(
#     transformers=[
#         ('cat', OneHotEncoder(handle_unknown='ignore'), important_features)
#     ]
# )

# # Function to create the LSTM model
# def create_lstm_model(input_shape, lstm_units=256, dropout_rate=0.2, learning_rate=0.001):
#     model = Sequential([
#         LSTM(lstm_units, input_shape=input_shape, return_sequences=True, kernel_regularizer=l2(0.001)),
#         Dropout(dropout_rate),
#         LSTM(lstm_units),
#         Dropout(dropout_rate),
#         Dense(256, activation='relu'),
#         Dense(len(np.unique(y)), activation='softmax')
#     ])

#     optimizer = Adam(learning_rate=learning_rate)
#     model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy', metrics=['accuracy'])

#     return model

# # Create a KerasClassifier wrapper
# lstm_model = KerasClassifier(build_fn=create_lstm_model, input_shape=(X_train.shape[1], X_train.shape[2]), epochs=10, batch_size=32, verbose=0)

# # Create the full pipeline
# pipeline = Pipeline([
#     ('cat_preprocessor', categorical_preprocessor),
#     ('lstm_model', lstm_model)
# ])

# # Hyperparameter grid for RandomizedSearchCV
# param_dist = {
#     'lstm_model__lstm_units': randint(128, 512),
#     'lstm_model__dropout_rate': [0.2, 0.3, 0.4],
#     'lstm_model__learning_rate': [0.001, 0.01, 0.1]
# }

# # Split the data
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# # Setup RandomizedSearchCV
# rnd_search = RandomizedSearchCV(
#     pipeline,
#     param_distributions=param_dist,
#     n_iter=10,  # Use more iterations for better search, e.g., 50 or 100
#     cv=3,  # 3-fold cross-validation
#     scoring='accuracy',
#     n_jobs=-1,
#     verbose=3,
#     random_state=42
# )

# # Time the fitting process
# import time
# start_time = time.time()
# rnd_search.fit(X_train, y_train)
# elapsed_time = time.time() - start_time

# # Display results
# print(f"Elapsed time: {elapsed_time:.2f} seconds")
# print("Best Parameters:", rnd_search.best_params_)
# print("Best Cross-validation Score: {:.2f}".format(rnd_search.best_score_))
# print("Test Set Score: {:.2f}".format(rnd_search.score(X_test, y_test)))



IndexError: tuple index out of range

In [None]:
# import tensorflow as tf
# from tensorflow.keras.models import Sequential
# from tensorflow.keras.layers import LSTM, Dense, Dropout
# from tensorflow.keras.optimizers import Adam
# from sklearn.model_selection import train_test_split
# from sklearn.preprocessing import LabelEncoder, OneHotEncoder
# import numpy as np
# import matplotlib.pyplot as plt

# # Data preprocessing
# important_features = [
#     "CRASH_YEAR", "CRASH_DT", "CRASH_SEVERITY", "COLLISION_TYPE", "WEATHER_CONDITION",
#     "LIGHT_CONDITION", "ROADWAY_SURFACE_COND", "RELATION_TO_ROADWAY", "ROADWAY_ALIGNMENT",
#     "ROADWAY_DEFECT", "ROADWAY_DESCRIPTION", "INTERSECTION_TYPE", "SCHOOL_ZONE",
#     "SPEED_NOTSPEED", "INTERSECTION_ANALYSIS", "MAINLINE_YN", "NIGHT", "VDOT_DISTRICT",
#     "AREA_TYPE", "RTE_NM"
# ]

# # Assuming df is already loaded with data
# le = LabelEncoder()
# y = le.fit_transform(df['CRASH_SEVERITY'])

# ohe = OneHotEncoder(handle_unknown='ignore')
# X = ohe.fit_transform(df[important_features]).toarray()

# sequence_length = 10
# X_seq, y_seq = [], []
# for i in range(len(X) - sequence_length):
#     X_seq.append(X[i:i + sequence_length])
#     y_seq.append(y[i + sequence_length])

# X_seq = np.array(X_seq)
# y_seq = np.array(y_seq)

# X_train, X_test, y_train, y_test = train_test_split(X_seq, y_seq, test_size=0.3, random_state=42)

# # Build the LSTM model
# model = Sequential([
#     LSTM(256, input_shape=(X_train.shape[1], X_train.shape[2]), return_sequences=True, kernel_regularizer=l2(0.001)),
#     Dropout(0.2),
#     LSTM(256),
#     Dropout(0.2),
#     Dense(256, activation='relu'),
#     Dense(5, activation='softmax')  # Ensure 5 output classes for your task
# ])

# # Compile model
# optimizer = Adam(learning_rate=0.001)
# model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# # Train the model
# history = model.fit(X_train, y_train, epochs=20, batch_size=32, validation_split=0.2)

# # # Plot the accuracy
# # plt.plot(history.history['accuracy'])
# # plt.plot(history.history['val_accuracy'])
# # plt.title('Model Accuracy')
# # plt.xlabel('Epochs')
# # plt.ylabel('Accuracy')
# # plt.legend(['Train', 'Validation'])
# # plt.show()

# # # Evaluate on test data
# # test_loss, test_accuracy = model.evaluate(X_test, y_test)
# # print(f"Test Accuracy: {test_accuracy:.2f}")
