<a href="https://colab.research.google.com/github/anissapatel/ML4VA/blob/main/ML4VA_Traffic_Patterns.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!git clone https://github.com/anissapatel/ML4VA.git

Cloning into 'ML4VA'...
remote: Enumerating objects: 26, done.[K
remote: Counting objects: 100% (26/26), done.[K
remote: Compressing objects: 100% (24/24), done.[K
remote: Total 26 (delta 10), reused 0 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (26/26), 619.74 KiB | 6.39 MiB/s, done.
Resolving deltas: 100% (10/10), done.


In [2]:
!pip install geopandas



In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, cross_val_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
from scipy.stats import randint
import requests
import geopandas as gpd
import numpy as np
np.random.seed(42)

In [4]:
# Define the API URL
url = "https://services.arcgis.com/p5v98VHDX9Atv3l7/arcgis/rest/services/CrashData_test/FeatureServer/0/query?where=1%3D1&outFields=*&outSR=4326&f=json"
response = requests.get(url)

In [5]:
# Check if the request was successful
if response.status_code == 200:
    print("Data fetched successfully!")
    # Load the GeoJSON data into a GeoDataFrame
    geo_data = gpd.read_file(response.text)
    # Convert to DataFrame if you don’t need geometry
    df = pd.DataFrame(geo_data.drop(columns='geometry'))
    print("Data successfully converted to DataFrame!")
else:
    print(f"Failed to fetch data. Status code: {response.status_code}")

df.head()

Data fetched successfully!
Data successfully converted to DataFrame!


Unnamed: 0,OBJECTID,DOCUMENT_NBR,CRASH_YEAR,CRASH_DT,CRASH_MILITARY_TM,CRASH_SEVERITY,K_PEOPLE,A_PEOPLE,B_PEOPLE,C_PEOPLE,...,AREA_TYPE,SYSTEM,VSP,OWNERSHIP,PLAN_DISTRICT,MPO_NAME,RTE_NM,RNS_MP,NODE,OFFSET
0,1,163465085,2016,2016-12-06 05:00:00+00:00,700,O,0,0,0,0,...,Urban,VDOT Interstate,7,1. State Hwy Agency,Northern Virginia,NOVA,R-VA IS00095NB,158.85,,
1,2,160725125,2016,2016-01-26 05:00:00+00:00,1636,O,0,0,0,0,...,Urban,NonVDOT secondary,5,3. City or Town Hwy Agency,Hampton Roads,HAMP,S-VA114NP WOODLAND RD,0.5,253154.0,318.32
2,3,160465260,2016,2016-02-13 05:00:00+00:00,1845,B,0,0,2,0,...,Rural,VDOT Secondary,1,1. State Hwy Agency,Northern Neck,,R-VA066SC00640NB,2.54,1149141.0,5.82
3,4,161255113,2016,2016-03-26 04:00:00+00:00,1,O,0,0,0,0,...,Urban,VDOT Secondary,1,1. State Hwy Agency,Richmond Regional,RICH,R-VA042SC00782SB,1.19,,
4,5,162005180,2016,2016-07-18 04:00:00+00:00,732,O,0,0,0,0,...,Rural,VDOT Primary,6,1. State Hwy Agency,"Roanoke Valley-Alleghany, West Piedmont",,R-VA US00220NB,48.61,328737.0,5.34


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 67 columns):
 #   Column                    Non-Null Count  Dtype              
---  ------                    --------------  -----              
 0   OBJECTID                  2000 non-null   int32              
 1   DOCUMENT_NBR              2000 non-null   int32              
 2   CRASH_YEAR                2000 non-null   object             
 3   CRASH_DT                  2000 non-null   datetime64[ms, UTC]
 4   CRASH_MILITARY_TM         2000 non-null   object             
 5   CRASH_SEVERITY            2000 non-null   object             
 6   K_PEOPLE                  2000 non-null   int16              
 7   A_PEOPLE                  2000 non-null   int16              
 8   B_PEOPLE                  2000 non-null   int16              
 9   C_PEOPLE                  2000 non-null   int16              
 10  PERSONS_INJURED           2000 non-null   int16              
 11  PEDESTRIANS_KILLE

In [7]:
df = df[["CRASH_YEAR", "CRASH_DT", "CRASH_SEVERITY","COLLISION_TYPE", "WEATHER_CONDITION",
         "LIGHT_CONDITION","ROADWAY_SURFACE_COND", "RELATION_TO_ROADWAY","ROADWAY_ALIGNMENT",
         "ROADWAY_DEFECT", "ROADWAY_DESCRIPTION","INTERSECTION_TYPE","SCHOOL_ZONE",
         "SPEED_NOTSPEED","INTERSECTION_ANALYSIS", "MAINLINE_YN","NIGHT","VDOT_DISTRICT",
         "AREA_TYPE","RTE_NM"]]
df.head()

Unnamed: 0,CRASH_YEAR,CRASH_DT,CRASH_SEVERITY,COLLISION_TYPE,WEATHER_CONDITION,LIGHT_CONDITION,ROADWAY_SURFACE_COND,RELATION_TO_ROADWAY,ROADWAY_ALIGNMENT,ROADWAY_DEFECT,ROADWAY_DESCRIPTION,INTERSECTION_TYPE,SCHOOL_ZONE,SPEED_NOTSPEED,INTERSECTION_ANALYSIS,MAINLINE_YN,NIGHT,VDOT_DISTRICT,AREA_TYPE,RTE_NM
0,2016,2016-12-06 05:00:00+00:00,O,1. Rear End,1. No Adverse Condition (Clear/Cloudy),4. Darkness - Road Lighted,1. Dry,8. Non-Intersection,1. Straight - Level,1. No Defects,"3. Two-Way, Divided, Positive Median Barrier",1. Not at Intersection,3. No,Yes,Not Intersection,Yes,Yes,9. Northern Virginia,Urban,R-VA IS00095NB
1,2016,2016-01-26 05:00:00+00:00,O,2. Angle,1. No Adverse Condition (Clear/Cloudy),2. Daylight,1. Dry,1. Main-Line Roadway,1. Straight - Level,1. No Defects,"1. Two-Way, Not Divided",1. Not at Intersection,3. No,No,Not Intersection,Yes,No,5. Hampton Roads,Urban,S-VA114NP WOODLAND RD
2,2016,2016-02-13 05:00:00+00:00,B,9. Fixed Object - Off Road,1. No Adverse Condition (Clear/Cloudy),4. Darkness - Road Lighted,1. Dry,8. Non-Intersection,1. Straight - Level,1. No Defects,"1. Two-Way, Not Divided",1. Not at Intersection,3. No,No,VDOT Intersection,Yes,Yes,6. Fredericksburg,Rural,R-VA066SC00640NB
3,2016,2016-03-26 04:00:00+00:00,O,1. Rear End,5. Rain,5. Darkness - Road Not Lighted,2. Wet,8. Non-Intersection,1. Straight - Level,1. No Defects,"1. Two-Way, Not Divided",1. Not at Intersection,3. No,No,Not Intersection,Yes,Yes,4. Richmond,Urban,R-VA042SC00782SB
4,2016,2016-07-18 04:00:00+00:00,O,1. Rear End,1. No Adverse Condition (Clear/Cloudy),2. Daylight,1. Dry,9. Within Intersection,3. Grade - Straight,1. No Defects,"2. Two-Way, Divided, Unprotected Median",3. Three Approaches,3. No,No,VDOT Intersection,Yes,No,2. Salem,Rural,R-VA US00220NB


In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 20 columns):
 #   Column                 Non-Null Count  Dtype              
---  ------                 --------------  -----              
 0   CRASH_YEAR             2000 non-null   object             
 1   CRASH_DT               2000 non-null   datetime64[ms, UTC]
 2   CRASH_SEVERITY         2000 non-null   object             
 3   COLLISION_TYPE         2000 non-null   object             
 4   WEATHER_CONDITION      2000 non-null   object             
 5   LIGHT_CONDITION        2000 non-null   object             
 6   ROADWAY_SURFACE_COND   2000 non-null   object             
 7   RELATION_TO_ROADWAY    2000 non-null   object             
 8   ROADWAY_ALIGNMENT      2000 non-null   object             
 9   ROADWAY_DEFECT         2000 non-null   object             
 10  ROADWAY_DESCRIPTION    2000 non-null   object             
 11  INTERSECTION_TYPE      2000 non-null   object           

In [9]:
# Step 3: Define target and feature columns
target_column = "CRASH_SEVERITY"
categorical_columns = ["COLLISION_TYPE", "WEATHER_CONDITION", "LIGHT_CONDITION",
                       "ROADWAY_SURFACE_COND", "RELATION_TO_ROADWAY", "ROADWAY_ALIGNMENT",
                       "ROADWAY_DEFECT", "ROADWAY_DESCRIPTION", "INTERSECTION_TYPE",
                       "SCHOOL_ZONE", "SPEED_NOTSPEED", "INTERSECTION_ANALYSIS",
                       "MAINLINE_YN", "NIGHT", "VDOT_DISTRICT", "AREA_TYPE", "RTE_NM"]

In [10]:
# Split data into features (X) and target (y)
X = df[categorical_columns]
y = df[target_column]

In [11]:
# Encode the target variable
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

In [12]:
# Further split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

In [13]:
# Step 4: Define preprocessing and model pipeline
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(sparse_output=False, handle_unknown="ignore"), categorical_columns)
])

In [14]:
# Define the pipeline with preprocessing and model
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(random_state=42))
])

In [15]:
# Step 5: Initial Model Evaluation using Cross-Validation
cross_val_scores = cross_val_score(pipeline, X_train, y_train, scoring="neg_mean_squared_error", cv=10, error_score='raise')
initial_rmse_scores = np.sqrt(-cross_val_scores)
print("Initial Cross-Validation RMSE Scores:", initial_rmse_scores)
print("Mean RMSE:", initial_rmse_scores.mean())

Initial Cross-Validation RMSE Scores: [1.59203487 1.4726939  1.49702261 1.5625711  1.38888185 1.49907511
 1.53438176 1.51618302 1.51162677 1.3523169 ]
Mean RMSE: 1.4926787870736444


In [16]:
# Step 6: Hyperparameter tuning with GridSearchCV
param_grid = {'regressor__n_estimators': [50, 100, 150],
              'regressor__max_features': [3, 5, 7],
              'regressor__max_depth': [10, 15, 20, None],  # Add max depth to control overfitting
              'regressor__min_samples_split': [2, 5, 10],   # Minimum samples required to split
              'regressor__bootstrap': [False]
}

grid_search = GridSearchCV(pipeline,
                           param_grid,
                           cv=5,
                           scoring='neg_mean_squared_error',
                           return_train_score=True
)

In [17]:
# Fit GridSearchCV
grid_search.fit(X_train, y_train)

In [18]:
# Display the best hyperparameters and best model
print("Best Hyperparameters (GridSearchCV):", grid_search.best_params_)
best_model = grid_search.best_estimator_

Best Hyperparameters (GridSearchCV): {'regressor__bootstrap': False, 'regressor__max_depth': None, 'regressor__max_features': 7, 'regressor__min_samples_split': 10, 'regressor__n_estimators': 150}


In [19]:
# Step 7: Model evaluation on test set
y_test_predictions = best_model.predict(X_test)
final_mse = mean_squared_error(y_test, y_test_predictions)
final_rmse = np.sqrt(final_mse)
print("Final Test RMSE (GridSearchCV):", final_rmse)

# Mean Absolute Error (optional)
final_mae = mean_absolute_error(y_test, y_test_predictions)
print("Final Test MAE (GridSearchCV):", final_mae)

Final Test RMSE (GridSearchCV): 1.3940779185004748
Final Test MAE (GridSearchCV): 1.244900324074074


In [20]:
# View GridSearchCV results in a DataFrame
results_df = pd.DataFrame(grid_search.cv_results_)
print(results_df[['params', 'mean_test_score', 'std_test_score']].sort_values(by="mean_test_score", ascending=False))

                                                params  mean_test_score  \
107  {'regressor__bootstrap': False, 'regressor__ma...        -1.993134   
89   {'regressor__bootstrap': False, 'regressor__ma...        -1.994950   
106  {'regressor__bootstrap': False, 'regressor__ma...        -1.995982   
88   {'regressor__bootstrap': False, 'regressor__ma...        -1.999102   
97   {'regressor__bootstrap': False, 'regressor__ma...        -1.999872   
..                                                 ...              ...   
91   {'regressor__bootstrap': False, 'regressor__ma...        -2.129761   
100  {'regressor__bootstrap': False, 'regressor__ma...        -2.142446   
90   {'regressor__bootstrap': False, 'regressor__ma...        -2.146812   
81   {'regressor__bootstrap': False, 'regressor__ma...        -2.148881   
99   {'regressor__bootstrap': False, 'regressor__ma...        -2.162415   

     std_test_score  
107        0.109617  
89         0.103363  
106        0.107436  
88         

In [21]:
# Step 8: Further tuning with RandomizedSearchCV
param_distribs = {'regressor__n_estimators': randint(low=50, high=200),
                  'regressor__max_features': randint(low=3, high=8),
                  'regressor__max_depth': [10, 15, 20, None],
                  'regressor__min_samples_split': [2, 5, 10],
                  'regressor__bootstrap': [False]
}

rnd_search = RandomizedSearchCV(pipeline,
                                param_distributions=param_distribs,
                                n_iter=15,  # Try 15 random combinations
                                cv=5,
                                scoring='neg_mean_squared_error',
                                random_state=42
)

In [22]:
# Fit RandomizedSearchCV
rnd_search.fit(X_train, y_train)

In [23]:
# Display the best hyperparameters from RandomizedSearchCV
print("Best Hyperparameters (RandomizedSearchCV):", rnd_search.best_params_)

Best Hyperparameters (RandomizedSearchCV): {'regressor__bootstrap': False, 'regressor__max_depth': None, 'regressor__max_features': 6, 'regressor__min_samples_split': 10, 'regressor__n_estimators': 57}


In [24]:
# Evaluate on test data with the best model from RandomizedSearchCV
y_test_predictions_rnd = rnd_search.best_estimator_.predict(X_test)
final_mse_rnd = mean_squared_error(y_test, y_test_predictions_rnd)
final_rmse_rnd = np.sqrt(final_mse_rnd)
print("Final Test RMSE (RandomizedSearchCV):", final_rmse_rnd)

# Display Mean Absolute Error
final_mae_rnd = mean_absolute_error(y_test, y_test_predictions_rnd)
print("Final Test MAE (RandomizedSearchCV):", final_mae_rnd)

Final Test RMSE (RandomizedSearchCV): 1.3963755091382017
Final Test MAE (RandomizedSearchCV): 1.250947716513506


In [25]:
# Display each combination's performance from RandomizedSearchCV
cvres = rnd_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(np.sqrt(-mean_score), params)

1.4187184613791313 {'regressor__bootstrap': False, 'regressor__max_depth': 20, 'regressor__max_features': 6, 'regressor__min_samples_split': 2, 'regressor__n_estimators': 64}
1.4176709178640814 {'regressor__bootstrap': False, 'regressor__max_depth': 20, 'regressor__max_features': 7, 'regressor__min_samples_split': 2, 'regressor__n_estimators': 152}
1.4277209762277594 {'regressor__bootstrap': False, 'regressor__max_depth': 15, 'regressor__max_features': 5, 'regressor__min_samples_split': 10, 'regressor__n_estimators': 124}
1.4192759284689425 {'regressor__bootstrap': False, 'regressor__max_depth': 20, 'regressor__max_features': 7, 'regressor__min_samples_split': 10, 'regressor__n_estimators': 199}
1.43366243468862 {'regressor__bootstrap': False, 'regressor__max_depth': 10, 'regressor__max_features': 4, 'regressor__min_samples_split': 5, 'regressor__n_estimators': 87}
1.4239562458275559 {'regressor__bootstrap': False, 'regressor__max_depth': 15, 'regressor__max_features': 6, 'regressor__m

In [26]:
# Get feature importances from the best model if available
if hasattr(best_model.named_steps['regressor'], 'feature_importances_'):
    feature_importances = best_model.named_steps['regressor'].feature_importances_
    encoded_columns = best_model.named_steps['preprocessor'].named_transformers_['cat'].get_feature_names_out(categorical_columns)
    importance_df = pd.DataFrame(sorted(zip(feature_importances, encoded_columns), reverse=True), columns=["Importance", "Feature"])
    print(importance_df.head(10))  # Display top 10 most important features

   Importance                                            Feature
0    0.014108                             COLLISION_TYPE_12. Ped
1    0.012306          COLLISION_TYPE_9. Fixed Object - Off Road
2    0.012202                            COLLISION_TYPE_10. Deer
3    0.011663        ROADWAY_DESCRIPTION_1. Two-Way, Not Divided
4    0.011409                         COLLISION_TYPE_1. Rear End
5    0.011013                                 SPEED_NOTSPEED_Yes
6    0.010821  ROADWAY_DESCRIPTION_3. Two-Way, Divided, Posit...
7    0.010468                     VDOT_DISTRICT_5. Hampton Roads
8    0.010273              ROADWAY_ALIGNMENT_1. Straight - Level
9    0.010255                            LIGHT_CONDITION_3. Dusk


# Summary of results:
**Intial cross-validation RMSE Scores (before tuning)**

[1.592, 1.474, 1.492, 1.566, 1.389, 1.498, 1.534, 1.520, 1.512, 1.351]

*  Mean RMSE: 1.493

---



**Best Hyperparamters form GridSearchCV**

{'regressor__bootstrap': False, 'regressor__max_depth': None, 'regressor__max_features': 3, 'regressor__min_samples_split': 10, 'regressor__n_estimators': 150}

---



**Final test RMSE and MAE from GridSearchCV**

* Final Test RMSE: 1.396
* Final Test MAE: 1.260


---

**Best Hyperparameters from RandomizedSearchCV**

{'regressor__bootstrap': False, 'regressor__max_depth': None, 'regressor__max_features': 6, 'regressor__min_samples_split': 10, 'regressor__n_estimators': 57}


---

**Final test RMSE and MAE from RandomizedSearchCV**

*   Final Test RMSE: 1.405
*   Final Test MAE: 1.259


---

 **Feature Importances (Top 10 Features)**

1. Importance: 0.013178 | Feature: COLLISION_TYPE_12. Ped
2. Importance: 0.012126 | Feature: COLLISION_TYPE_9. Fixed Object - Off Road
3. Importance: 0.011911 | Feature: ROADWAY_SURFACE_COND_2. Wet
4. Importance: 0.011283 | Feature: ROADWAY_DESCRIPTION_3. Two-Way, Divided, Positive
5. Importance: 0.011151 | Feature: ROADWAY_DESCRIPTION_1. Two-Way, Not Divided
6. Importance: 0.010915 | Feature: COLLISION_TYPE_1. Rear End
7. Importance: 0.010695 | Feature: VDOT_DISTRICT_5. Hampton Roads
8. Importance: 0.010624 | Feature: COLLISION_TYPE_10. Deer
9. Importance: 0.010613 | Feature: LIGHT_CONDITION_2. Daylight
10. Importance: 0.010547 | Feature: ROADWAY_ALIGNMENT_1. Straight - Level






In [27]:
import time
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from scipy.stats import randint

# Select important categorical features
important_features = ["COLLISION_TYPE", "WEATHER_CONDITION", "LIGHT_CONDITION", "ROADWAY_SURFACE_COND", "ROADWAY_DESCRIPTION","VDOT_DISTRICT", "ROADWAY_ALIGNMENT"]


# Preprocessor for categorical features
categorical_preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), important_features)
    ]
)

In [28]:
# Create the full pipeline
pipeline = Pipeline([
    ('cat_preprocessor', categorical_preprocessor),
    ('classifier', GradientBoostingClassifier(random_state=42, subsample=0.9))
])

In [29]:
# Parameter distribution for RandomizedSearchCV
param_dist = {
    'classifier__n_estimators': randint(100, 300),
    'classifier__learning_rate': [0.05, 0.1, 0.2],
    'classifier__max_depth': randint(3, 7),
    'classifier__min_samples_split': randint(50, 150),
    'classifier__min_samples_leaf': randint(20, 50)
}

# Setup RandomizedSearchCV
rnd_search = RandomizedSearchCV(
    pipeline,
    param_distributions=param_dist,
    n_iter=100,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    verbose=3,
    random_state=42
)

In [30]:
# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Time the fitting process
start_time = time.time()
rnd_search.fit(X_train, y_train)
elapsed_time = time.time() - start_time

# Display results
print(f"Elapsed time: {elapsed_time:.2f} seconds")
print("Best Parameters:", rnd_search.best_params_)
print("Best Cross-validation Score: {:.2f}".format(rnd_search.best_score_))
print("Test Set Score: {:.2f}".format(rnd_search.score(X_test, y_test)))

Fitting 5 folds for each of 100 candidates, totalling 500 fits


KeyboardInterrupt: 

In [None]:
# from sklearn.compose import ColumnTransformer
# from sklearn.pipeline import Pipeline
# from sklearn.preprocessing import OneHotEncoder
# from imblearn.over_sampling import SMOTE
# from sklearn.ensemble import GradientBoostingClassifier
# from sklearn.metrics import accuracy_score, classification_report
# from sklearn.model_selection import GridSearchCV, train_test_split

# # Step 1: Define the preprocessing step
# preprocessor = ColumnTransformer(
#     transformers=[
#         ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_columns)
#     ]
# )

# # Step 2: Preprocess X_train before SMOTE
# X_train_encoded = preprocessor.fit_transform(X_train)


In [None]:
# # Oversample with SMOTE
# sm = SMOTE(random_state=42)
# X_resampled, y_resampled = sm.fit_resample(X_train_encoded, y_train)

In [None]:
# # Define Gradient Boosting pipeline
# gb_pipeline = Pipeline([
#     ('classifier', GradientBoostingClassifier(random_state=42))
# ])

In [None]:
# # Hyperparameter grid for Gradient Boosting
# gb_param_grid = {
#     'classifier__n_estimators': [100, 500],
#     'classifier__learning_rate': [0.01, 0.1],
#     'classifier__max_depth': [3, 5],
#     'classifier__subsample': [0.8, 1.0],
#     'classifier__min_samples_split': [2, 10],
#     'classifier__min_samples_leaf': [1, 5]
# }

# # Perform grid search
# gb_random_search = RandomizedSearchCV(
#     gb_pipeline,
#     gb_param_grid,
#     n_iter=30,
#     cv=3,
#     scoring='accuracy',
#     n_jobs=-1,
#     verbose=3,
#     random_state=42
# )

In [None]:
# # Train Gradient Boosting with progress logging
# gb_random_search.fit(X_resampled, y_resampled)

In [None]:
# # Display the best hyperparameters from RandomizedSearchCV
# print("Best Parameters (Gradient Boosting):", gb_random_search.best_params_)

In [None]:
# # Predictions and evaluation for Gradient Boosting
# X_test_encoded = preprocessor.transform(X_test)
# best_model = gb_random_search.best_estimator_
# y_pred = best_model.predict(X_test_encoded)
# print("Accuracy:", accuracy_score(y_test, y_pred))
# print(classification_report(y_test, y_pred))

In [None]:
# print(np.unique(y))  # Check unique labels in your target variable
# #print(X_seq.shape)  # Ensure it's not too sparse or imbalanced
# print(np.bincount(y_seq))
# y_train = np.array(y_train)  # Ensure this is a flat array, not 2D
# y_test = np.array(y_test)    # Ensure this is a flat array as well

In [None]:
# import tensorflow as tf
# print(tf.__version__)


In [None]:
# !pip install --upgrade tensorflow

In [None]:
# print(tf.__version__)

In [31]:
!pip install scikeras

Collecting scikeras
  Downloading scikeras-0.13.0-py3-none-any.whl.metadata (3.1 kB)
Downloading scikeras-0.13.0-py3-none-any.whl (26 kB)
Installing collected packages: scikeras
Successfully installed scikeras-0.13.0


In [None]:
# import pandas as pd
# import numpy as np
# import requests
# import geopandas as gpd
# from sklearn.model_selection import train_test_split, RandomizedSearchCV
# from sklearn.preprocessing import OneHotEncoder, LabelEncoder
# from sklearn.compose import ColumnTransformer
# from keras.models import Sequential
# from keras.layers import LSTM, Dense, Dropout
# from scikeras.wrappers import KerasRegressor
# from scipy.stats import randint

# # Set seed for reproducibility
# np.random.seed(42)

# # Define the API URL
# url = "https://services.arcgis.com/p5v98VHDX9Atv3l7/arcgis/rest/services/CrashData_test/FeatureServer/0/query?where=1%3D1&outFields=*&outSR=4326&f=json"
# response = requests.get(url)

# # Check if the request was successful
# if response.status_code == 200:
#     print("Data fetched successfully!")
#     # Load the GeoJSON data into a GeoDataFrame
#     geo_data = gpd.read_file(response.text)
#     # Convert to DataFrame if you don’t need geometry
#     df = pd.DataFrame(geo_data.drop(columns='geometry'))
#     print("Data successfully converted to DataFrame!")
# else:
#     print(f"Failed to fetch data. Status code: {response.status_code}")

# # Filter the DataFrame
# df = df[["CRASH_YEAR", "CRASH_DT", "CRASH_SEVERITY", "COLLISION_TYPE", "WEATHER_CONDITION",
#          "LIGHT_CONDITION", "ROADWAY_SURFACE_COND", "RELATION_TO_ROADWAY", "ROADWAY_ALIGNMENT",
#          "ROADWAY_DEFECT", "ROADWAY_DESCRIPTION", "INTERSECTION_TYPE", "SCHOOL_ZONE",
#          "SPEED_NOTSPEED", "INTERSECTION_ANALYSIS", "MAINLINE_YN", "NIGHT", "VDOT_DISTRICT",
#          "AREA_TYPE", "RTE_NM"]]

# # Define target and feature columns
# target_column = "CRASH_SEVERITY"
# categorical_columns = ["COLLISION_TYPE", "WEATHER_CONDITION", "LIGHT_CONDITION",
#                        "ROADWAY_SURFACE_COND", "RELATION_TO_ROADWAY", "ROADWAY_ALIGNMENT",
#                        "ROADWAY_DEFECT", "ROADWAY_DESCRIPTION", "INTERSECTION_TYPE",
#                        "SCHOOL_ZONE", "SPEED_NOTSPEED", "INTERSECTION_ANALYSIS",
#                        "MAINLINE_YN", "NIGHT", "VDOT_DISTRICT", "AREA_TYPE", "RTE_NM"]

# # Split data into features (X) and target (y)
# X = df[categorical_columns]
# y = df[target_column]

# # Encode the target variable
# label_encoder = LabelEncoder()
# y_encoded = label_encoder.fit_transform(y)

# # Further split data into training and testing sets
# X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

# # Preprocessing for categorical features
# preprocessor = ColumnTransformer([
#     ('cat', OneHotEncoder(sparse_output=False, handle_unknown="ignore"), categorical_columns)
# ])

# # Define a function to create the LSTM model
# def create_lstm_model(units=50, dropout_rate=0.2, optimizer='adam'):
#     model = Sequential()
#     model.add(LSTM(units=units, activation='relu', input_shape=(X_train.shape[1], 1)))
#     model.add(Dropout(dropout_rate))
#     model.add(Dense(1, activation='sigmoid'))  # Sigmoid for binary classification
#     model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
#     return model

# # Wrap the model for use in scikit-learn
# model = KerasRegressor(build_fn=create_lstm_model, verbose=0)

# # Preprocess the data
# X_train_processed = preprocessor.fit_transform(X_train)
# X_test_processed = preprocessor.transform(X_test)

# # Reshape the input data for LSTM
# X_train_reshaped = X_train_processed.reshape((X_train_processed.shape[0], X_train_processed.shape[1], 1))
# X_test_reshaped = X_test_processed.reshape((X_test_processed.shape[0], X_test_processed.shape[1], 1))

# # Parameter distribution for RandomizedSearchCV
# param_dist = {
#     'units': [50, 100, 150],
#     'dropout_rate': [0.1, 0.2, 0.3],
#     'batch_size': [16, 32],
#     'epochs': [50, 100],
#     'optimizer': ['adam', 'rmsprop']  # Removed the 'model__' prefix
# }

# # Setup RandomizedSearchCV
# rnd_search = RandomizedSearchCV(
#     model,
#     param_distributions=param_dist,
#     n_iter=5,
#     cv=3,
#     scoring='neg_mean_squared_error',
#     n_jobs=-1,
#     verbose=1,
#     random_state=42
# )

# # Fit RandomizedSearchCV
# rnd_search.fit(X_train_reshaped, y_train)

# # Display results
# print("Best Parameters:", rnd_search.best_params_)
# print("Best Cross-validation Score:", rnd_search.best_score_)

# # Evaluate on test data with the best model from RandomizedSearchCV
# best_model = rnd_search.best_estimator_
# test_score = best_model.score(X_test_reshaped, y_test)
# print("Test Set Score:", test_score)

In [None]:
# import pandas as pd
# import numpy as np
# import requests
# import geopandas as gpd
# from sklearn.model_selection import train_test_split, RandomizedSearchCV
# from sklearn.preprocessing import OneHotEncoder, LabelEncoder
# from sklearn.compose import ColumnTransformer
# from sklearn.pipeline import Pipeline
# from keras.models import Sequential
# from keras.layers import LSTM, Dense, Dropout
# from scikeras.wrappers import KerasRegressor
# # from keras.wrappers.scikit_learn import KerasRegressor
# from scipy.stats import randint

# # Set seed for reproducibility
# # np.random.seed(42)

# # # Define the API URL
# # url = "https://services.arcgis.com/p5v98VHDX9Atv3l7/arcgis/rest/services/CrashData_test/FeatureServer/0/query?where=1%3D1&outFields=*&outSR=4326&f=json"
# # response = requests.get(url)

# # # Check if the request was successful
# # if response.status_code == 200:
# #     print("Data fetched successfully!")
# #     # Load the GeoJSON data into a GeoDataFrame
# #     geo_data = gpd.read_file(response.text)
# #     # Convert to DataFrame if you don’t need geometry
# #     df = pd.DataFrame(geo_data.drop(columns='geometry'))
# #     print("Data successfully converted to DataFrame!")
# # else:
# #     print(f"Failed to fetch data. Status code: {response.status_code}")

# # Filter the DataFrame
# df = df[["CRASH_YEAR", "CRASH_DT", "CRASH_SEVERITY", "COLLISION_TYPE", "WEATHER_CONDITION",
#          "LIGHT_CONDITION", "ROADWAY_SURFACE_COND", "RELATION_TO_ROADWAY", "ROADWAY_ALIGNMENT",
#          "ROADWAY_DEFECT", "ROADWAY_DESCRIPTION", "INTERSECTION_TYPE", "SCHOOL_ZONE",
#          "SPEED_NOTSPEED", "INTERSECTION_ANALYSIS", "MAINLINE_YN", "NIGHT", "VDOT_DISTRICT",
#          "AREA_TYPE", "RTE_NM"]]

# # Define target and feature columns
# target_column = "CRASH_SEVERITY"
# categorical_columns = ["COLLISION_TYPE", "WEATHER_CONDITION", "LIGHT_CONDITION",
#                        "ROADWAY_SURFACE_COND", "RELATION_TO_ROADWAY", "ROADWAY_ALIGNMENT",
#                        "ROADWAY_DEFECT", "ROADWAY_DESCRIPTION", "INTERSECTION_TYPE",
#                        "SCHOOL_ZONE", "SPEED_NOTSPEED", "INTERSECTION_ANALYSIS",
#                        "MAINLINE_YN", "NIGHT", "VDOT_DISTRICT", "AREA_TYPE", "RTE_NM"]

# # Split data into features (X) and target (y)
# X = df[categorical_columns]
# y = df[target_column]

# # Encode the target variable
# label_encoder = LabelEncoder()
# y_encoded = label_encoder.fit_transform(y)

# # Further split data into training and testing sets
# X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

# # Preprocessing for categorical features
# preprocessor = ColumnTransformer([
#     ('cat', OneHotEncoder(sparse_output=False, handle_unknown="ignore"), categorical_columns)
# ])

# # Define a function to create the LSTM model
# def create_lstm_model(units=50, dropout_rate=0.2, optimizer='adam'):
#     model = Sequential()
#     model.add(LSTM(units=units, activation='relu', input_shape=(X_train.shape[1], 1)))
#     model.add(Dropout(dropout_rate))
#     model.add(Dense(1, activation='sigmoid'))  # Sigmoid for binary classification
#     model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
#     return model

# # Wrap the model for use in scikit-learn
# model = KerasRegressor(build_fn=create_lstm_model, verbose=0)

# # Reshape the input data for LSTM
# X_train_reshaped = X_train.values.reshape((X_train.shape[0], X_train.shape[1], 1))
# X_test_reshaped = X_test.values.reshape((X_test.shape[0], X_test.shape[1], 1))

# # Parameter distribution for RandomizedSearchCV
# # param_dist = {
# #     'units': [50, 100, 150],
# #     'dropout_rate': [0.1, 0.2, 0.3],
# #     'batch_size': [16, 32],
# #     'epochs': [50, 100]
# # }

# param_dist = {
#     'model__units': [50, 100, 150], # Changed 'units' to 'model__units'
#     'model__dropout_rate': [0.1, 0.2, 0.3], # Changed 'dropout_rate' to 'model__dropout_rate'
#     'batch_size': [16, 32],
#     'epochs': [50, 100],
#     'model__optimizer': ['adam', 'rmsprop'] # Added optimizer to the search space
# }

# # Setup RandomizedSearchCV
# rnd_search = RandomizedSearchCV(
#     model,
#     param_distributions=param_dist,
#     n_iter=5,
#     cv=3,
#     scoring='neg_mean_squared_error',
#     n_jobs=-1,
#     verbose=1,
#     random_state=42
# )

# # Fit RandomizedSearchCV
# rnd_search.fit(X_train_reshaped, y_train)

# # Display results
# print("Best Parameters:", rnd_search.best_params_)
# print("Best Cross-validation Score:", rnd_search.best_score_)

# # Evaluate on test data with the best model from RandomizedSearchCV
# best_model = rnd_search.best_estimator_
# test_score = best_model.score(X_test_reshaped, y_test)
# print("Test Set Score:", test_score)

In [None]:
# import time
# import numpy as np
# from sklearn.model_selection import train_test_split
# from sklearn.metrics import accuracy_score
# from tensorflow.keras.models import Sequential
# from tensorflow.keras.layers import LSTM, Dense, Dropout
# from tensorflow.keras.optimizers import Adam

# # Create an LSTM model
# def create_lstm_model(input_shape, lstm_units, dropout_rate, learning_rate):
#     model = Sequential([
#         LSTM(lstm_units, input_shape=input_shape, return_sequences=False),
#         Dropout(dropout_rate),
#         Dense(1, activation='sigmoid')  # Assuming binary classification
#     ])
#     model.compile(optimizer=Adam(learning_rate=learning_rate),
#                   loss='binary_crossentropy',
#                   metrics=['accuracy'])
#     return model

# # Sample parameter grid
# param_grid = {
#     'lstm_units': [50, 100],
#     'dropout_rate': [0.2, 0.5],
#     'learning_rate': [0.001, 0.01]
# }

# # Example data (replace with your dataset)
# X = np.random.rand(1000, 10, 5)  # 1000 samples, 10 time steps, 5 features
# y = np.random.randint(0, 2, 1000)  # Binary classification labels

# # Split the data
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# # Time the fitting process
# start_time = time.time()

# best_score = 0
# best_params = None
# best_model = None

# # Grid Search
# for lstm_units in param_grid['lstm_units']:
#     for dropout_rate in param_grid['dropout_rate']:
#         for learning_rate in param_grid['learning_rate']:
#             # Build the model
#             model = create_lstm_model(input_shape=(X_train.shape[1], X_train.shape[2]),
#                                       lstm_units=lstm_units,
#                                       dropout_rate=dropout_rate,
#                                       learning_rate=learning_rate)

#             # Train the model
#             model.fit(X_train, y_train, epochs=5, batch_size=32, verbose=0)  # Adjust epochs as needed

#             # Evaluate on validation data (use train_test_split for a separate validation set if required)
#             val_loss, val_accuracy = model.evaluate(X_train, y_train, verbose=0)

#             # Track the best model
#             if val_accuracy > best_score:
#                 best_score = val_accuracy
#                 best_params = {
#                     'lstm_units': lstm_units,
#                     'dropout_rate': dropout_rate,
#                     'learning_rate': learning_rate
#                 }
#                 best_model = model

# elapsed_time = time.time() - start_time

# # Evaluate the best model on the test set
# test_loss, test_accuracy = best_model.evaluate(X_test, y_test, verbose=0)

# # Display results
# print(f"Elapsed time: {elapsed_time:.2f} seconds")
# print("Best Parameters:", best_params)
# print(f"Best Cross-validation Score: {best_score:.2f}")
# print(f"Test Set Score: {test_accuracy:.2f}")


In [None]:
# import pandas as pd
# import numpy as np
# import requests
# import geopandas as gpd
# from sklearn.model_selection import train_test_split, RandomizedSearchCV
# from sklearn.preprocessing import OneHotEncoder, LabelEncoder
# from sklearn.compose import ColumnTransformer
# from sklearn.pipeline import Pipeline
# from sklearn.base import BaseEstimator, TransformerMixin
# from keras.models import Sequential
# from keras.layers import LSTM, Dense, Dropout, Reshape
# from scikeras.wrappers import KerasRegressor
# from imblearn.over_sampling import SMOTE
# from scipy.stats import randint

# # Custom transformer to reshape the data
# class ReshapeTransformer(BaseEstimator, TransformerMixin):
#     def fit(self, X, y=None):
#         return self

#     def transform(self, X):
#         # Reshape to 3D for LSTM
#         return X.reshape((X.shape[0], 1, X.shape[1]))

# # Set seed for reproducibility
# # np.random.seed(42)

# # # Define the API URL
# # url = "https://services.arcgis.com/p5v98VHDX9Atv3l7/arcgis/rest/services/CrashData_test/FeatureServer/0/query?where=1%3D1&outFields=*&outSR=4326&f=json"
# # response = requests.get(url)

# # # Check if the request was successful
# # if response.status_code == 200:
# #     print("Data fetched successfully!")
# #     # Load the GeoJSON data into a GeoDataFrame
# #     geo_data = gpd.read_file(response.text)
# #     # Convert to DataFrame if you don’t need geometry
# #     df = pd.DataFrame(geo_data.drop(columns='geometry'))
# #     print("Data successfully converted to DataFrame!")
# # else:
# #     print(f"Failed to fetch data. Status code: {response.status_code}")

# # Filter the DataFrame
# df = df[["CRASH_YEAR", "CRASH_DT", "CRASH_SEVERITY", "COLLISION_TYPE", "WEATHER_CONDITION",
#          "LIGHT_CONDITION", "ROADWAY_SURFACE_COND", "RELATION_TO_ROADWAY", "ROADWAY_ALIGNMENT",
#          "ROADWAY_DEFECT", "ROADWAY_DESCRIPTION", "INTERSECTION_TYPE", "SCHOOL_ZONE",
#          "SPEED_NOTSPEED", "INTERSECTION_ANALYSIS", "MAINLINE_YN", "NIGHT", "VDOT_DISTRICT",
#          "AREA_TYPE", "RTE_NM"]]

# # Define target and feature columns
# target_column = "CRASH_SEVERITY"
# categorical_columns = ["COLLISION_TYPE", "WEATHER_CONDITION", "LIGHT_CONDITION",
#                        "ROADWAY_SURFACE_COND", "RELATION_TO_ROADWAY", "ROADWAY_ALIGNMENT",
#                        "ROADWAY_DEFECT", "ROADWAY_DESCRIPTION", "INTERSECTION_TYPE",
#                        "SCHOOL_ZONE", "SPEED_NOTSPEED", "INTERSECTION_ANALYSIS",
#                        "MAINLINE_YN", "NIGHT", "VDOT_DISTRICT", "AREA_TYPE", "RTE_NM"]

# # Split data into features (X) and target (y)
# X = df[categorical_columns]
# y = df[target_column]

# # Encode the target variable
# label_encoder = LabelEncoder()
# y_encoded = label_encoder.fit_transform(y)

# # Further split data into training and testing sets
# X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

# # Preprocessing for categorical features
# preprocessor = ColumnTransformer([
#     ('cat', OneHotEncoder(sparse_output=False, handle_unknown="ignore"), categorical_columns)
# ])

# # Define a function to create the LSTM model
# def create_lstm_model(units=50, dropout_rate=0.2, optimizer='adam'):
#     model = Sequential()
#     # The input shape should be (1, number_of_features) since we will reshape it
#     model.add(Reshape((1, X_train.shape[1])))  # Reshape layer to convert 2D input to 3D
#     model.add(LSTM(units=units, activation='relu', input_shape=(1, X_train.shape[1])))
#     model.add(Dropout(dropout_rate))
#     model.add(Dense(1, activation='sigmoid'))  # Sigmoid for binary classification
#     model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
#     return model

# # Wrap the model for use in scikit-learn
# model = KerasRegressor(build_fn=create_lstm_model, verbose=0)

# # Define the pipeline
# pipeline = Pipeline([
#     ('preprocessor', preprocessor),
#     ('oversample', SMOTE(random_state=12)),  # Apply SMOTE for class balancing
#     ('reshape', ReshapeTransformer()),  # Reshape after oversampling
#     ('model', model)
# ])

# # Parameter distribution for RandomizedSearchCV
# param_dist = {
#     'model__units': [50, 100, 150],
#     'model__dropout_rate': [0.1, 0.2, 0.3],
#     'model__batch_size': [16, 32],
#     'model__epochs': [50, 100],
#     'model__optimizer': ['adam', 'rmsprop']  # Removed the 'model__' prefix
# }

# # Setup RandomizedSearchCV
# rnd_search = RandomizedSearchCV(
#     pipeline,
#     param_distributions=param_dist,
#     n_iter=5,
#     cv=3,
#     scoring='neg_mean_squared_error',
#     n_jobs=-1,
#     verbose=1,
#     random_state=42
# )

# # Fit RandomizedSearchCV
# rnd_search.fit(X_train, y_train)  # Pass the original X_train (2D)

# # Display results
# print("Best Parameters:", rnd_search.best_params_)
# print("Best Cross-validation Score:", rnd_search.best_score_)

# # Evaluate on test data with the best model from RandomizedSearchCV
# best_model = rnd_search.best_estimator_
# test_score = best_model.score(X_test, y_test)  # Pass the original X_test (2D)
# print("Test Set Score:", test_score)

In [33]:
# import pandas as pd
# import numpy as np
# import requests
# import geopandas as gpd
# from sklearn.model_selection import train_test_split, GridSearchCV
# from sklearn.preprocessing import OneHotEncoder, LabelEncoder
# from sklearn.compose import ColumnTransformer
# from sklearn.pipeline import Pipeline
# from sklearn.base import BaseEstimator, TransformerMixin
# from keras.models import Sequential
# from keras.layers import LSTM, Dense, Dropout
# from keras import backend as K
# from scikeras.wrappers import KerasRegressor

# # Custom transformer to reshape data (if needed, but we'll skip for now)
# class ReshapeTransformer(BaseEstimator, TransformerMixin):
#     def fit(self, X, y=None):
#         return self

#     def transform(self, X):
#         return X.reshape((X.shape[0], 1, X.shape[1]))

# # Preprocessing for categorical features
# preprocessor = ColumnTransformer([
#     ('cat', OneHotEncoder(sparse_output=False, handle_unknown="ignore"), categorical_columns)
# ])

# label_encoder = LabelEncoder()

# # Fit and transform the labels
# y_train_encoded = label_encoder.fit_transform(y_train)

# # If needed, you can also transform y_test the same way
# y_test_encoded = label_encoder.transform(y_test)

# # Function to create the LSTM model
# def create_lstm_model(neurons1=50, neurons2=30, neurons3=10, optimizer='adam', activation='relu'):
#     K.clear_session()
#     model = Sequential()
#     model.add(LSTM(units=neurons1, input_shape=(1, X_train.shape[1]), activation=activation))
#     model.add(Dropout(0.2))
#     model.add(Dense(units=neurons2, activation=activation))
#     model.add(Dropout(0.2))
#     model.add(Dense(units=neurons3, activation=activation))
#     model.add(Dense(1, activation='sigmoid'))
#     model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
#     return model

# # Wrap the model for use with scikit-learn
# model = KerasRegressor(
#     model=create_lstm_model,
#     verbose=0,
#     neurons1=50,
#     neurons2=30,
#     neurons3=10,
#     optimizer="adam",
#     activation="relu",
#     batch_size=16,
#     epochs=50
# )

# # Define pipeline with preprocessing and model
# pipeline = Pipeline([
#     ('preprocessor', preprocessor),
#     ('model', model)
# ])

# # Parameter grid for GridSearchCV
# param_grid = {
#     'model__neurons1': [50, 100, 150],
#     'model__neurons2': [30, 60],
#     'model__neurons3': [10, 20],
#     'model__optimizer': ['adam', 'rmsprop'],
#     'model__activation': ['relu', 'tanh'],
#     'model__batch_size': [16, 32],
#     'model__epochs': [50, 100]
# }

# # Setup GridSearchCV
# grid_search = GridSearchCV(
#     estimator=pipeline,
#     param_grid=param_grid,
#     scoring='accuracy',
#     cv=3,
#     verbose=1,
#     n_jobs=-1
# )

# # Fit GridSearchCV
# grid_search.fit(X_train, y_train_encoded)

# # Display results
# print("Best Parameters:", grid_search.best_params_)
# print("Best Cross-validation Score:", grid_search.best_score_)

# # Evaluate on test data
# best_model = grid_search.best_estimator_
# test_score = best_model.score(X_test, y_test)
# print("Test Set Score:", test_score)


Fitting 3 folds for each of 192 candidates, totalling 576 fits




ValueError: 
All the 576 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
192 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/pipeline.py", line 473, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit"])
  File "/usr/local/lib/python3.10/dist-packages/scikeras/wrappers.py", line 770, in fit
    self._fit(
  File "/usr/local/lib/python3.10/dist-packages/scikeras/wrappers.py", line 938, in _fit
    self._fit_keras_model(
  File "/usr/local/lib/python3.10/dist-packages/scikeras/wrappers.py", line 535, in _fit_keras_model
    hist = self.model_.fit(x=X, y=y, **fit_args)
  File "/usr/local/lib/python3.10/dist-packages/keras/src/utils/traceback_utils.py", line 122, in error_handler
    raise e.with_traceback(filtered_tb) from None
  File "/usr/local/lib/python3.10/dist-packages/keras/src/models/functional.py", line 244, in _adjust_input_rank
    raise ValueError(
ValueError: Exception encountered when calling Sequential.call().

[1mInvalid input shape for input Tensor("data:0", shape=(None, 745), dtype=float32). Expected shape (None, 1, 17), but input has incompatible shape (None, 745)[0m

Arguments received by Sequential.call():
  • inputs=tf.Tensor(shape=(None, 745), dtype=float32)
  • training=True
  • mask=None

--------------------------------------------------------------------------------
192 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/pipeline.py", line 473, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit"])
  File "/usr/local/lib/python3.10/dist-packages/scikeras/wrappers.py", line 770, in fit
    self._fit(
  File "/usr/local/lib/python3.10/dist-packages/scikeras/wrappers.py", line 938, in _fit
    self._fit_keras_model(
  File "/usr/local/lib/python3.10/dist-packages/scikeras/wrappers.py", line 535, in _fit_keras_model
    hist = self.model_.fit(x=X, y=y, **fit_args)
  File "/usr/local/lib/python3.10/dist-packages/keras/src/utils/traceback_utils.py", line 122, in error_handler
    raise e.with_traceback(filtered_tb) from None
  File "/usr/local/lib/python3.10/dist-packages/keras/src/models/functional.py", line 244, in _adjust_input_rank
    raise ValueError(
ValueError: Exception encountered when calling Sequential.call().

[1mInvalid input shape for input Tensor("data:0", shape=(None, 732), dtype=float32). Expected shape (None, 1, 17), but input has incompatible shape (None, 732)[0m

Arguments received by Sequential.call():
  • inputs=tf.Tensor(shape=(None, 732), dtype=float32)
  • training=True
  • mask=None

--------------------------------------------------------------------------------
192 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/pipeline.py", line 473, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit"])
  File "/usr/local/lib/python3.10/dist-packages/scikeras/wrappers.py", line 770, in fit
    self._fit(
  File "/usr/local/lib/python3.10/dist-packages/scikeras/wrappers.py", line 938, in _fit
    self._fit_keras_model(
  File "/usr/local/lib/python3.10/dist-packages/scikeras/wrappers.py", line 535, in _fit_keras_model
    hist = self.model_.fit(x=X, y=y, **fit_args)
  File "/usr/local/lib/python3.10/dist-packages/keras/src/utils/traceback_utils.py", line 122, in error_handler
    raise e.with_traceback(filtered_tb) from None
  File "/usr/local/lib/python3.10/dist-packages/keras/src/models/functional.py", line 244, in _adjust_input_rank
    raise ValueError(
ValueError: Exception encountered when calling Sequential.call().

[1mInvalid input shape for input Tensor("data:0", shape=(None, 736), dtype=float32). Expected shape (None, 1, 17), but input has incompatible shape (None, 736)[0m

Arguments received by Sequential.call():
  • inputs=tf.Tensor(shape=(None, 736), dtype=float32)
  • training=True
  • mask=None


In [37]:
import time
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping

# Create an LSTM model
def create_lstm_model(input_shape, lstm_units, dropout_rate, learning_rate):
    model = Sequential([
        LSTM(lstm_units, input_shape=input_shape, return_sequences=False),
        Dropout(dropout_rate),
        Dense(1, activation='sigmoid')  # Assuming binary classification
    ])
    model.compile(optimizer=Adam(learning_rate=learning_rate),
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    return model

# Sample parameter grid
param_grid = {
    'lstm_units': [32, 64, 128],
    'dropout_rate': [0.2, 0.3, 0.5],
    'learning_rate': [0.001, 0.005, 0.01],
    'batch_size': [32, 64]
}

# Example data (replace with your dataset)
X = np.random.rand(1000, 10, 5)  # 1000 samples, 10 time steps, 5 features
y = np.random.randint(0, 2, 1000)  # Binary classification labels

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Time the fitting process
start_time = time.time()

best_score = 0
best_params = None
best_model = None

# Grid Search
# Expanded Grid Search without Validation Set
for lstm_units in param_grid['lstm_units']:
    for dropout_rate in param_grid['dropout_rate']:
        for learning_rate in param_grid['learning_rate']:
            for batch_size in param_grid['batch_size']:
                # Create the LSTM model
                model = create_lstm_model(
                    input_shape=(X_train.shape[1], X_train.shape[2]),
                    lstm_units=lstm_units,
                    dropout_rate=dropout_rate,
                    learning_rate=learning_rate
                )

                # Train the model
                model.fit(X_train, y_train,
                          epochs=50,  # Increase if needed
                          batch_size=batch_size,
                          verbose=0)  # Suppress detailed training output

                # Evaluate the model on the test set
                test_loss, test_accuracy = model.evaluate(X_test, y_test, verbose=0)

                # Track the best model based on test accuracy
                if test_accuracy > best_score:
                    best_score = test_accuracy
                    best_params = {
                        'lstm_units': lstm_units,
                        'dropout_rate': dropout_rate,
                        'learning_rate': learning_rate,
                        'batch_size': batch_size
                    }
                    best_model = model


elapsed_time = time.time() - start_time

# Evaluate the best model on the test set
test_loss, test_accuracy = best_model.evaluate(X_test, y_test, verbose=0)

# Display results
print(f"Elapsed time: {elapsed_time:.2f} seconds")
print("Best Parameters:", best_params)
print(f"Best Cross-validation Score: {best_score:.2f}")
print(f"Test Set Score: {test_accuracy:.2f}")


Elapsed time: 270.59 seconds
Best Parameters: {'lstm_units': 64, 'dropout_rate': 0.5, 'learning_rate': 0.001, 'batch_size': 64}
Best Cross-validation Score: 0.57
Test Set Score: 0.57
