<a href="https://colab.research.google.com/github/anissapatel/ML4VA/blob/main/ML4VA_Traffic_Patterns.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [113]:
!git clone https://github.com/anissapatel/ML4VA.git

fatal: destination path 'ML4VA' already exists and is not an empty directory.


In [114]:
!pip install geopandas



In [115]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, cross_val_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
from scipy.stats import randint
import requests
import geopandas as gpd
import numpy as np
np.random.seed(42)

In [116]:
# Define the API URL: a FeatureServer query asking for every row (where=1=1) and
# all fields, with outSR=4326 and the response formatted as JSON (f=json).
# NOTE(review): the run recorded in this notebook returned exactly 2000 rows, which
# looks like a server-side page cap - confirm, and page through the results if the
# full dataset is needed.
url = "https://services.arcgis.com/p5v98VHDX9Atv3l7/arcgis/rest/services/CrashData_test/FeatureServer/0/query?where=1%3D1&outFields=*&outSR=4326&f=json"
# requests applies no timeout by default, so an unresponsive server would hang
# this cell indefinitely; fail after 30 seconds instead.
response = requests.get(url, timeout=30)

In [117]:
# Stop here when the request failed: carrying on would leave `df` undefined on a
# fresh kernel, or silently keep showing a stale `df` left over from an earlier run.
if response.status_code != 200:
    raise RuntimeError(f"Failed to fetch data. Status code: {response.status_code}")

print("Data fetched successfully!")
# Parse the JSON payload into a GeoDataFrame
geo_data = gpd.read_file(response.text)
# Keep only the attribute columns; the geometry column is not used further on
df = pd.DataFrame(geo_data.drop(columns='geometry'))
print("Data successfully converted to DataFrame!")

df.head()

Data fetched successfully!
Data successfully converted to DataFrame!


Unnamed: 0,OBJECTID,DOCUMENT_NBR,CRASH_YEAR,CRASH_DT,CRASH_MILITARY_TM,CRASH_SEVERITY,K_PEOPLE,A_PEOPLE,B_PEOPLE,C_PEOPLE,...,AREA_TYPE,SYSTEM,VSP,OWNERSHIP,PLAN_DISTRICT,MPO_NAME,RTE_NM,RNS_MP,NODE,OFFSET
0,1,163465085,2016,2016-12-06 05:00:00+00:00,700,O,0,0,0,0,...,Urban,VDOT Interstate,7,1. State Hwy Agency,Northern Virginia,NOVA,R-VA IS00095NB,158.85,,
1,2,160725125,2016,2016-01-26 05:00:00+00:00,1636,O,0,0,0,0,...,Urban,NonVDOT secondary,5,3. City or Town Hwy Agency,Hampton Roads,HAMP,S-VA114NP WOODLAND RD,0.5,253154.0,318.32
2,3,160465260,2016,2016-02-13 05:00:00+00:00,1845,B,0,0,2,0,...,Rural,VDOT Secondary,1,1. State Hwy Agency,Northern Neck,,R-VA066SC00640NB,2.54,1149141.0,5.82
3,4,161255113,2016,2016-03-26 04:00:00+00:00,1,O,0,0,0,0,...,Urban,VDOT Secondary,1,1. State Hwy Agency,Richmond Regional,RICH,R-VA042SC00782SB,1.19,,
4,5,162005180,2016,2016-07-18 04:00:00+00:00,732,O,0,0,0,0,...,Rural,VDOT Primary,6,1. State Hwy Agency,"Roanoke Valley-Alleghany, West Piedmont",,R-VA US00220NB,48.61,328737.0,5.34


In [118]:
# Column names, non-null counts and dtypes of the frame as fetched
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 67 columns):
 #   Column                    Non-Null Count  Dtype              
---  ------                    --------------  -----              
 0   OBJECTID                  2000 non-null   int32              
 1   DOCUMENT_NBR              2000 non-null   int32              
 2   CRASH_YEAR                2000 non-null   object             
 3   CRASH_DT                  2000 non-null   datetime64[ms, UTC]
 4   CRASH_MILITARY_TM         2000 non-null   object             
 5   CRASH_SEVERITY            2000 non-null   object             
 6   K_PEOPLE                  2000 non-null   int16              
 7   A_PEOPLE                  2000 non-null   int16              
 8   B_PEOPLE                  2000 non-null   int16              
 9   C_PEOPLE                  2000 non-null   int16              
 10  PERSONS_INJURED           2000 non-null   int16              
 11  PEDESTRIANS_KILLE

In [119]:
# Narrow the frame to the crash year/date, the severity target and the candidate predictors.
# NOTE(review): this rebinds `df`, so the full 67-column frame is no longer reachable
# after this cell without re-running the load cell.
selected_columns = [
    "CRASH_YEAR", "CRASH_DT", "CRASH_SEVERITY",
    "COLLISION_TYPE", "WEATHER_CONDITION", "LIGHT_CONDITION",
    "ROADWAY_SURFACE_COND", "RELATION_TO_ROADWAY", "ROADWAY_ALIGNMENT",
    "ROADWAY_DEFECT", "ROADWAY_DESCRIPTION", "INTERSECTION_TYPE",
    "SCHOOL_ZONE", "SPEED_NOTSPEED", "INTERSECTION_ANALYSIS",
    "MAINLINE_YN", "NIGHT", "VDOT_DISTRICT",
    "AREA_TYPE", "RTE_NM",
]
df = df[selected_columns]
df.head()

Unnamed: 0,CRASH_YEAR,CRASH_DT,CRASH_SEVERITY,COLLISION_TYPE,WEATHER_CONDITION,LIGHT_CONDITION,ROADWAY_SURFACE_COND,RELATION_TO_ROADWAY,ROADWAY_ALIGNMENT,ROADWAY_DEFECT,ROADWAY_DESCRIPTION,INTERSECTION_TYPE,SCHOOL_ZONE,SPEED_NOTSPEED,INTERSECTION_ANALYSIS,MAINLINE_YN,NIGHT,VDOT_DISTRICT,AREA_TYPE,RTE_NM
0,2016,2016-12-06 05:00:00+00:00,O,1. Rear End,1. No Adverse Condition (Clear/Cloudy),4. Darkness - Road Lighted,1. Dry,8. Non-Intersection,1. Straight - Level,1. No Defects,"3. Two-Way, Divided, Positive Median Barrier",1. Not at Intersection,3. No,Yes,Not Intersection,Yes,Yes,9. Northern Virginia,Urban,R-VA IS00095NB
1,2016,2016-01-26 05:00:00+00:00,O,2. Angle,1. No Adverse Condition (Clear/Cloudy),2. Daylight,1. Dry,1. Main-Line Roadway,1. Straight - Level,1. No Defects,"1. Two-Way, Not Divided",1. Not at Intersection,3. No,No,Not Intersection,Yes,No,5. Hampton Roads,Urban,S-VA114NP WOODLAND RD
2,2016,2016-02-13 05:00:00+00:00,B,9. Fixed Object - Off Road,1. No Adverse Condition (Clear/Cloudy),4. Darkness - Road Lighted,1. Dry,8. Non-Intersection,1. Straight - Level,1. No Defects,"1. Two-Way, Not Divided",1. Not at Intersection,3. No,No,VDOT Intersection,Yes,Yes,6. Fredericksburg,Rural,R-VA066SC00640NB
3,2016,2016-03-26 04:00:00+00:00,O,1. Rear End,5. Rain,5. Darkness - Road Not Lighted,2. Wet,8. Non-Intersection,1. Straight - Level,1. No Defects,"1. Two-Way, Not Divided",1. Not at Intersection,3. No,No,Not Intersection,Yes,Yes,4. Richmond,Urban,R-VA042SC00782SB
4,2016,2016-07-18 04:00:00+00:00,O,1. Rear End,1. No Adverse Condition (Clear/Cloudy),2. Daylight,1. Dry,9. Within Intersection,3. Grade - Straight,1. No Defects,"2. Two-Way, Divided, Unprotected Median",3. Three Approaches,3. No,No,VDOT Intersection,Yes,No,2. Salem,Rural,R-VA US00220NB


In [120]:
# Re-check dtypes and non-null counts after the column selection
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 20 columns):
 #   Column                 Non-Null Count  Dtype              
---  ------                 --------------  -----              
 0   CRASH_YEAR             2000 non-null   object             
 1   CRASH_DT               2000 non-null   datetime64[ms, UTC]
 2   CRASH_SEVERITY         2000 non-null   object             
 3   COLLISION_TYPE         2000 non-null   object             
 4   WEATHER_CONDITION      2000 non-null   object             
 5   LIGHT_CONDITION        2000 non-null   object             
 6   ROADWAY_SURFACE_COND   2000 non-null   object             
 7   RELATION_TO_ROADWAY    2000 non-null   object             
 8   ROADWAY_ALIGNMENT      2000 non-null   object             
 9   ROADWAY_DEFECT         2000 non-null   object             
 10  ROADWAY_DESCRIPTION    2000 non-null   object             
 11  INTERSECTION_TYPE      2000 non-null   object           

In [121]:
# Step 3: Define target and feature columns
# The target is the crash severity label; every predictor listed here is treated as categorical.
target_column = "CRASH_SEVERITY"
categorical_columns = [
    "COLLISION_TYPE",
    "WEATHER_CONDITION",
    "LIGHT_CONDITION",
    "ROADWAY_SURFACE_COND",
    "RELATION_TO_ROADWAY",
    "ROADWAY_ALIGNMENT",
    "ROADWAY_DEFECT",
    "ROADWAY_DESCRIPTION",
    "INTERSECTION_TYPE",
    "SCHOOL_ZONE",
    "SPEED_NOTSPEED",
    "INTERSECTION_ANALYSIS",
    "MAINLINE_YN",
    "NIGHT",
    "VDOT_DISTRICT",
    "AREA_TYPE",
    "RTE_NM",
]

In [122]:
# Separate the predictor columns (X) from the severity label (y)
X, y = df[categorical_columns], df[target_column]

In [123]:
# Turn the severity labels into integer codes.
# NOTE(review): LabelEncoder numbers the classes in sorted label order, so the codes
# do not follow a severity ranking; the regressor fitted later treats them as a
# numeric scale - confirm this is intended.
label_encoder = LabelEncoder().fit(y)
y_encoded = label_encoder.transform(y)

In [124]:
# Hold out 20% of the rows as a test set; the fixed seed makes the split repeatable
holdout_split = train_test_split(X, y_encoded, random_state=42, test_size=0.2)
X_train, X_test, y_train, y_test = holdout_split

In [125]:
# Step 4: Define preprocessing and model pipeline
# All predictors are categorical, so a single dense one-hot encoder covers them;
# handle_unknown="ignore" lets categories unseen during fit pass through transform
# without raising.
one_hot = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
preprocessor = ColumnTransformer(transformers=[('cat', one_hot, categorical_columns)])

In [126]:
# Chain the one-hot preprocessing with a random-forest regressor.
# NOTE(review): the target passed to this pipeline is the label-encoded severity
# class, so the regressor predicts a continuous value over class codes - confirm
# a regressor (rather than a classifier) is intended.
model_steps = [
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(random_state=42)),
]
pipeline = Pipeline(steps=model_steps)

In [127]:
# Step 5: Initial Model Evaluation using Cross-Validation
# 10-fold CV of the untuned pipeline; scores come back as negative MSE, so they are
# negated and square-rooted to get a per-fold RMSE. error_score='raise' surfaces
# any fold failure instead of recording NaN.
cross_val_scores = cross_val_score(
    estimator=pipeline,
    X=X_train,
    y=y_train,
    cv=10,
    scoring="neg_mean_squared_error",
    error_score='raise',
)
initial_rmse_scores = np.sqrt(-cross_val_scores)
mean_initial_rmse = initial_rmse_scores.mean()
print("Initial Cross-Validation RMSE Scores:", initial_rmse_scores)
print("Mean RMSE:", mean_initial_rmse)

Initial Cross-Validation RMSE Scores: [1.59203487 1.4726939  1.49702261 1.5625711  1.38888185 1.49907511
 1.53438176 1.51618302 1.51162677 1.3523169 ]
Mean RMSE: 1.4926787870736444


In [128]:
# Step 6: Hyperparameter tuning with GridSearchCV
# Keys use the "<pipeline step>__<parameter>" form so they reach the 'regressor' step.
param_grid = dict(
    regressor__n_estimators=[50, 100, 150],
    regressor__max_features=[3, 5, 7],
    regressor__max_depth=[10, 15, 20, None],    # depth cap to control overfitting (None = unlimited)
    regressor__min_samples_split=[2, 5, 10],    # minimum samples required to split a node
    regressor__bootstrap=[False],
)

# Exhaustive search over param_grid with 5-fold CV, scored by negative MSE
# (scikit-learn maximises scores, hence the sign).
# n_jobs=-1 spreads the candidate fits over all available cores, matching the
# classifier search later in the notebook; the forest's fixed random_state keeps
# the scores the same as a serial run.
grid_search = GridSearchCV(pipeline,
                           param_grid,
                           cv=5,
                           scoring='neg_mean_squared_error',
                           return_train_score=True,
                           n_jobs=-1
)

In [129]:
# Fit GridSearchCV: cross-validates every candidate in param_grid on the training split
grid_search.fit(X_train, y_train)

In [130]:
# Keep the winning pipeline for the evaluation cells that follow, then report its settings
best_model = grid_search.best_estimator_
print("Best Hyperparameters (GridSearchCV):", grid_search.best_params_)

Best Hyperparameters (GridSearchCV): {'regressor__bootstrap': False, 'regressor__max_depth': None, 'regressor__max_features': 7, 'regressor__min_samples_split': 10, 'regressor__n_estimators': 150}


In [131]:
# Step 7: Model evaluation on test set
# Score the tuned pipeline once on the held-out rows.
y_test_predictions = best_model.predict(X_test)

final_mse = mean_squared_error(y_test, y_test_predictions)
final_mae = mean_absolute_error(y_test, y_test_predictions)
final_rmse = np.sqrt(final_mse)  # RMSE is in the same units as the encoded target

print("Final Test RMSE (GridSearchCV):", final_rmse)
print("Final Test MAE (GridSearchCV):", final_mae)

Final Test RMSE (GridSearchCV): 1.3940779185004748
Final Test MAE (GridSearchCV): 1.244900324074074


In [132]:
# Rank every GridSearchCV candidate by mean CV score (negative MSE, so higher is better)
results_df = pd.DataFrame(grid_search.cv_results_)
ranked_results = results_df.sort_values(by="mean_test_score", ascending=False)
print(ranked_results[['params', 'mean_test_score', 'std_test_score']])

                                                params  mean_test_score  \
107  {'regressor__bootstrap': False, 'regressor__ma...        -1.993134   
89   {'regressor__bootstrap': False, 'regressor__ma...        -1.994950   
106  {'regressor__bootstrap': False, 'regressor__ma...        -1.995982   
88   {'regressor__bootstrap': False, 'regressor__ma...        -1.999102   
97   {'regressor__bootstrap': False, 'regressor__ma...        -1.999872   
..                                                 ...              ...   
91   {'regressor__bootstrap': False, 'regressor__ma...        -2.129761   
100  {'regressor__bootstrap': False, 'regressor__ma...        -2.142446   
90   {'regressor__bootstrap': False, 'regressor__ma...        -2.146812   
81   {'regressor__bootstrap': False, 'regressor__ma...        -2.148881   
99   {'regressor__bootstrap': False, 'regressor__ma...        -2.162415   

     std_test_score  
107        0.109617  
89         0.103363  
106        0.107436  
88         

In [133]:
# Step 8: Further tuning with RandomizedSearchCV
# Integer hyperparameters are drawn from randint (upper bound exclusive); the list
# entries are sampled from their listed values.
param_distribs = dict(
    regressor__n_estimators=randint(50, 200),
    regressor__max_features=randint(3, 8),
    regressor__max_depth=[10, 15, 20, None],
    regressor__min_samples_split=[2, 5, 10],
    regressor__bootstrap=[False],
)

# Randomized search: 15 sampled candidates, 5-fold CV each, scored by negative MSE.
# random_state fixes which candidates are drawn. n_jobs=-1 runs the fits on all
# available cores, matching the classifier search later in the notebook, without
# changing the scores.
rnd_search = RandomizedSearchCV(pipeline,
                                param_distributions=param_distribs,
                                n_iter=15,  # Try 15 random combinations
                                cv=5,
                                scoring='neg_mean_squared_error',
                                random_state=42,
                                n_jobs=-1
)

In [134]:
# Fit RandomizedSearchCV: cross-validates the sampled candidates on the training split
rnd_search.fit(X_train, y_train)

In [135]:
# Display the best hyperparameters found by RandomizedSearchCV
print("Best Hyperparameters (RandomizedSearchCV):", rnd_search.best_params_)

Best Hyperparameters (RandomizedSearchCV): {'regressor__bootstrap': False, 'regressor__max_depth': None, 'regressor__max_features': 6, 'regressor__min_samples_split': 10, 'regressor__n_estimators': 57}


In [136]:
# Score the best RandomizedSearchCV pipeline on the same held-out rows
best_rnd_model = rnd_search.best_estimator_
y_test_predictions_rnd = best_rnd_model.predict(X_test)

final_mse_rnd = mean_squared_error(y_test, y_test_predictions_rnd)
final_mae_rnd = mean_absolute_error(y_test, y_test_predictions_rnd)
final_rmse_rnd = np.sqrt(final_mse_rnd)

print("Final Test RMSE (RandomizedSearchCV):", final_rmse_rnd)
print("Final Test MAE (RandomizedSearchCV):", final_mae_rnd)

Final Test RMSE (RandomizedSearchCV): 1.3963755091382017
Final Test MAE (RandomizedSearchCV): 1.250947716513506


In [137]:
# Print the CV RMSE of every sampled candidate next to its parameters
# (mean_test_score holds negative MSE, hence the negation before the square root)
cvres = rnd_search.cv_results_
for candidate_idx, params in enumerate(cvres["params"]):
    print(np.sqrt(-cvres["mean_test_score"][candidate_idx]), params)

1.4187184613791313 {'regressor__bootstrap': False, 'regressor__max_depth': 20, 'regressor__max_features': 6, 'regressor__min_samples_split': 2, 'regressor__n_estimators': 64}
1.4176709178640814 {'regressor__bootstrap': False, 'regressor__max_depth': 20, 'regressor__max_features': 7, 'regressor__min_samples_split': 2, 'regressor__n_estimators': 152}
1.4277209762277594 {'regressor__bootstrap': False, 'regressor__max_depth': 15, 'regressor__max_features': 5, 'regressor__min_samples_split': 10, 'regressor__n_estimators': 124}
1.4192759284689425 {'regressor__bootstrap': False, 'regressor__max_depth': 20, 'regressor__max_features': 7, 'regressor__min_samples_split': 10, 'regressor__n_estimators': 199}
1.43366243468862 {'regressor__bootstrap': False, 'regressor__max_depth': 10, 'regressor__max_features': 4, 'regressor__min_samples_split': 5, 'regressor__n_estimators': 87}
1.4239562458275559 {'regressor__bootstrap': False, 'regressor__max_depth': 15, 'regressor__max_features': 6, 'regressor__m

In [138]:
# Rank the one-hot encoded features by the importance the fitted forest assigns them
fitted_regressor = best_model.named_steps['regressor']
if hasattr(fitted_regressor, 'feature_importances_'):
    fitted_encoder = best_model.named_steps['preprocessor'].named_transformers_['cat']
    # Names of the encoded columns, in the same order as the importances
    encoded_columns = fitted_encoder.get_feature_names_out(categorical_columns)
    feature_importances = fitted_regressor.feature_importances_
    ranked_pairs = sorted(zip(feature_importances, encoded_columns), reverse=True)
    importance_df = pd.DataFrame(ranked_pairs, columns=["Importance", "Feature"])
    print(importance_df.head(10))  # Display top 10 most important features

   Importance                                            Feature
0    0.014108                             COLLISION_TYPE_12. Ped
1    0.012306          COLLISION_TYPE_9. Fixed Object - Off Road
2    0.012202                            COLLISION_TYPE_10. Deer
3    0.011663        ROADWAY_DESCRIPTION_1. Two-Way, Not Divided
4    0.011409                         COLLISION_TYPE_1. Rear End
5    0.011013                                 SPEED_NOTSPEED_Yes
6    0.010821  ROADWAY_DESCRIPTION_3. Two-Way, Divided, Posit...
7    0.010468                     VDOT_DISTRICT_5. Hampton Roads
8    0.010273              ROADWAY_ALIGNMENT_1. Straight - Level
9    0.010255                            LIGHT_CONDITION_3. Dusk


# Summary of results:
**Initial cross-validation RMSE Scores (before tuning)**

[1.592, 1.473, 1.497, 1.563, 1.389, 1.499, 1.534, 1.516, 1.512, 1.352]

*  Mean RMSE: 1.493

---



**Best Hyperparameters from GridSearchCV**

{'regressor__bootstrap': False, 'regressor__max_depth': None, 'regressor__max_features': 7, 'regressor__min_samples_split': 10, 'regressor__n_estimators': 150}

---



**Final test RMSE and MAE from GridSearchCV**

* Final Test RMSE: 1.394
* Final Test MAE: 1.245


---

**Best Hyperparameters from RandomizedSearchCV**

{'regressor__bootstrap': False, 'regressor__max_depth': None, 'regressor__max_features': 6, 'regressor__min_samples_split': 10, 'regressor__n_estimators': 57}


---

**Final test RMSE and MAE from RandomizedSearchCV**

*   Final Test RMSE: 1.396
*   Final Test MAE: 1.251


---

 **Feature Importances (Top 10 Features)**

1. Importance: 0.014108 | Feature: COLLISION_TYPE_12. Ped
2. Importance: 0.012306 | Feature: COLLISION_TYPE_9. Fixed Object - Off Road
3. Importance: 0.012202 | Feature: COLLISION_TYPE_10. Deer
4. Importance: 0.011663 | Feature: ROADWAY_DESCRIPTION_1. Two-Way, Not Divided
5. Importance: 0.011409 | Feature: COLLISION_TYPE_1. Rear End
6. Importance: 0.011013 | Feature: SPEED_NOTSPEED_Yes
7. Importance: 0.010821 | Feature: ROADWAY_DESCRIPTION_3. Two-Way, Divided, Positive Median Barrier
8. Importance: 0.010468 | Feature: VDOT_DISTRICT_5. Hampton Roads
9. Importance: 0.010273 | Feature: ROADWAY_ALIGNMENT_1. Straight - Level
10. Importance: 0.010255 | Feature: LIGHT_CONDITION_3. Dusk






In [149]:
import time
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from scipy.stats import randint

# Subset of the categorical columns that the classifier pipeline is given
important_features = [
    "COLLISION_TYPE",
    "WEATHER_CONDITION",
    "LIGHT_CONDITION",
    "ROADWAY_SURFACE_COND",
    "ROADWAY_DESCRIPTION",
    "VDOT_DISTRICT",
    "ROADWAY_ALIGNMENT",
]

# One-hot encode only the columns named above; handle_unknown='ignore' lets
# categories unseen during fit pass through transform without raising
one_hot_encoder = OneHotEncoder(handle_unknown='ignore')
categorical_preprocessor = ColumnTransformer(
    transformers=[('cat', one_hot_encoder, important_features)]
)

In [150]:
# Build the classification pipeline: one-hot preprocessing followed by gradient boosting.
# NOTE(review): this rebinds `pipeline`, replacing the random-forest regression
# pipeline defined earlier in the notebook.
classifier_steps = [
    ('cat_preprocessor', categorical_preprocessor),
    ('classifier', GradientBoostingClassifier(subsample=0.9, random_state=42)),
]
pipeline = Pipeline(steps=classifier_steps)

In [151]:
# Search space for the gradient-boosting classifier.
# randint bounds are [low, high); learning_rate is sampled from the listed values.
param_dist = dict(
    classifier__n_estimators=randint(low=100, high=300),
    classifier__learning_rate=[0.05, 0.1, 0.2],
    classifier__max_depth=randint(low=3, high=7),
    classifier__min_samples_split=randint(low=50, high=150),
    classifier__min_samples_leaf=randint(low=20, high=50),
)

# Randomized search over param_dist: 100 sampled candidates, 5-fold CV, scored by accuracy,
# run on all cores with per-fit progress logging.
# NOTE(review): this rebinds `rnd_search`, replacing the regression search fitted earlier.
rnd_search = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=param_dist,
    scoring='accuracy',
    n_iter=100,
    cv=5,
    random_state=42,
    n_jobs=-1,
    verbose=3,
)

In [152]:
# Re-split with the raw severity labels as the classification target.
# NOTE(review): this rebinds X_train/X_test/y_train/y_test; the earlier split used the
# label-encoded target, so cells above this one see different y values if re-run afterwards.
classification_split = train_test_split(X, y, random_state=42, test_size=0.2)
X_train, X_test, y_train, y_test = classification_split

# Wall-clock timing of the whole search
start_time = time.time()
rnd_search.fit(X_train, y_train)
elapsed_time = time.time() - start_time

# Report the timing, the winning parameters, and the CV / held-out accuracy
print(f"Elapsed time: {elapsed_time:.2f} seconds")
print("Best Parameters:", rnd_search.best_params_)
best_cv_score = rnd_search.best_score_
print("Best Cross-validation Score: {:.2f}".format(best_cv_score))
test_set_score = rnd_search.score(X_test, y_test)
print("Test Set Score: {:.2f}".format(test_set_score))

Fitting 5 folds for each of 100 candidates, totalling 500 fits
Elapsed time: 146.08 seconds
Best Parameters: {'classifier__learning_rate': 0.05, 'classifier__max_depth': 3, 'classifier__min_samples_leaf': 31, 'classifier__min_samples_split': 107, 'classifier__n_estimators': 121}
Best Cross-validation Score: 0.63
Test Set Score: 0.65


In [140]:
# from sklearn.compose import ColumnTransformer
# from sklearn.pipeline import Pipeline
# from sklearn.preprocessing import OneHotEncoder
# from imblearn.over_sampling import SMOTE
# from sklearn.ensemble import GradientBoostingClassifier
# from sklearn.metrics import accuracy_score, classification_report
# from sklearn.model_selection import GridSearchCV, train_test_split

# # Step 1: Define the preprocessing step
# preprocessor = ColumnTransformer(
#     transformers=[
#         ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_columns)
#     ]
# )

# # Step 2: Preprocess X_train before SMOTE
# X_train_encoded = preprocessor.fit_transform(X_train)


In [141]:
# # Oversample with SMOTE
# sm = SMOTE(random_state=42)
# X_resampled, y_resampled = sm.fit_resample(X_train_encoded, y_train)

In [142]:
# # Define Gradient Boosting pipeline
# gb_pipeline = Pipeline([
#     ('classifier', GradientBoostingClassifier(random_state=42))
# ])

In [144]:
# # Hyperparameter grid for Gradient Boosting
# gb_param_grid = {
#     'classifier__n_estimators': [100, 500],
#     'classifier__learning_rate': [0.01, 0.1],
#     'classifier__max_depth': [3, 5],
#     'classifier__subsample': [0.8, 1.0],
#     'classifier__min_samples_split': [2, 10],
#     'classifier__min_samples_leaf': [1, 5]
# }

# # Perform randomized search
# gb_random_search = RandomizedSearchCV(
#     gb_pipeline,
#     gb_param_grid,
#     n_iter=30,
#     cv=3,
#     scoring='accuracy',
#     n_jobs=-1,
#     verbose=3,
#     random_state=42
# )

In [145]:
# # Train Gradient Boosting with progress logging
# gb_random_search.fit(X_resampled, y_resampled)

Fitting 3 folds for each of 30 candidates, totalling 90 fits


In [146]:
# # Display the best hyperparameters from RandomizedSearchCV
# print("Best Parameters (Gradient Boosting):", gb_random_search.best_params_)

Best Parameters (Gradient Boosting): {'classifier__subsample': 0.8, 'classifier__n_estimators': 500, 'classifier__min_samples_split': 2, 'classifier__min_samples_leaf': 1, 'classifier__max_depth': 5, 'classifier__learning_rate': 0.1}


In [148]:
# # Predictions and evaluation for Gradient Boosting
# X_test_encoded = preprocessor.transform(X_test)
# best_model = gb_random_search.best_estimator_
# y_pred = best_model.predict(X_test_encoded)
# print("Accuracy:", accuracy_score(y_test, y_pred))
# print(classification_report(y_test, y_pred))

Accuracy: 0.585
              precision    recall  f1-score   support

           0       0.14      0.05      0.07        21
           1       0.23      0.14      0.18        71
           2       0.26      0.11      0.16        44
           3       0.00      0.00      0.00         4
           4       0.66      0.84      0.74       260

    accuracy                           0.58       400
   macro avg       0.26      0.23      0.23       400
weighted avg       0.51      0.58      0.53       400

