In [1]:
# Import required libraries
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import classification_report, f1_score

# Load the dataset
data = pd.read_csv('xydata_2021winter.csv')

# Define target and features
target_columns_clusters = [f"Cluster{i}" for i in range(6)]  # Target variables
columns_to_drop = target_columns_clusters + ["Date", "Total_Accidents"]  # Non-predictive columns

# Retain only the most relevant features for X
selected_features = [
     "Time_Period", "Stn Press (kPa)", "Dew Point Temp (°C)", "Rel Hum (%)",
    "Visibility (km)", "Temp (°C)", "Wind Dir (10s deg)", "Wind Spd (km/h)",
    "C0D-1HA", "C0D-2HA", "C0D-4HA",  # Cluster 0 historical features
    "C1D-1HA", "C1D-2HA", "C1D-4HA",  # Cluster 1 historical features
    "C2D-1HA", "C2D-2HA", "C2D-3HA",  # Cluster 2 historical features
    "C3D-1HA", "C4D-1HA", "C4D-2HA", "C5D-1HA"  # Other cluster features
]

# Define target and features
y = data[target_columns_clusters]  # Binary target
X = data[selected_features]  # Selected features

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

print("Training and testing split done.")
print(f"Training set: {X_train.shape[0]} samples")
print(f"Testing set: {X_test.shape[0]} samples")

# Step 1: Initialize Pipeline
pipe_rf = Pipeline([
    ('classifier', MultiOutputClassifier(RandomForestClassifier(random_state=42)))
])

print("Pipeline initialized.")

# {'classifier__estimator__class_weight': 'balanced', 'classifier__estimator__max_depth': None, 'classifier__estimator__min_samples_leaf': 5, 'classifier__estimator__min_samples_split': 2, 'classifier__estimator__n_estimators': 200}

# Step 2: Define Extended Parameter Grid
param_grid_rf = {
    'classifier__estimator__n_estimators': [100, 200, 500],  # Number of trees
    'classifier__estimator__max_depth': [None, 10, 20],  # Tree depth
    'classifier__estimator__min_samples_split': [2, 5, 10],  # Min samples to split
    'classifier__estimator__min_samples_leaf': [1, 2, 5],  # Min samples per leaf
    'classifier__estimator__class_weight': [None, 'balanced']  # Handle class imbalance
}

# Step 3: Perform Grid Search with Cross-Validation
grid_rf = GridSearchCV(
    pipe_rf, param_grid_rf, cv=5, scoring='f1_weighted', return_train_score=True, verbose=1, n_jobs=-1
)
grid_rf.fit(X_train, y_train)

print("Grid search completed.")

# Step 4: Print the Best Parameters
print("Best parameters:\n", grid_rf.best_params_)

# Best training and validation scores
best_train_score = max(grid_rf.cv_results_['mean_train_score'])
print(f"Best cross-validation train score: {best_train_score:.2f}")
best_val_score = grid_rf.best_score_
print(f"Best cross-validation validation score: {best_val_score:.2f}")

# Step 5: Evaluate on Test Set
y_test_pred = grid_rf.best_estimator_.predict(X_test)

# Test set F1-score
test_f1_score = f1_score(y_test, y_test_pred, average='weighted')
print(f"Test-set F1-score: {test_f1_score:.2f}")

# Step 6: Detailed Evaluation
print("Classification report on the test set:\n")
print(classification_report(y_test, y_test_pred, zero_division=0))


Training and testing split done.
Training set: 338 samples
Testing set: 146 samples
Pipeline initialized.
Fitting 5 folds for each of 162 candidates, totalling 810 fits
Grid search completed.
Best parameters:
 {'classifier__estimator__class_weight': 'balanced', 'classifier__estimator__max_depth': None, 'classifier__estimator__min_samples_leaf': 5, 'classifier__estimator__min_samples_split': 2, 'classifier__estimator__n_estimators': 500}
Best cross-validation train score: 1.00
Best cross-validation validation score: 0.64
Test-set F1-score: 0.68
Classification report on the test set:

              precision    recall  f1-score   support

           0       0.54      0.65      0.59        55
           1       0.50      0.58      0.54        48
           2       0.71      0.80      0.75        75
           3       0.61      0.66      0.63        76
           4       0.72      0.77      0.74        73
           5       0.72      0.73      0.72        93

   micro avg       0.65      0

In [None]:
# Import required libraries
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import classification_report, f1_score

# Load the dataset
data = pd.read_csv('xydata_2021winter.csv')

# Define target and features
target_columns_clusters = [f"Cluster{i}" for i in range(6)]  # Target variables
columns_to_drop = target_columns_clusters + ["Date", "Total_Accidents"]  # Non-predictive columns

# Retain only the most relevant features for X
selected_features = [
     "Time_Period", "Stn Press (kPa)", "Dew Point Temp (°C)", "Rel Hum (%)",
    "Visibility (km)", "Temp (°C)", "Wind Dir (10s deg)", "Wind Spd (km/h)",
    "C0D-1HA", "C0D-2HA", "C0D-4HA",  # Cluster 0 historical features
    "C1D-1HA", "C1D-2HA", "C1D-4HA",  # Cluster 1 historical features
    "C2D-1HA", "C2D-2HA", "C2D-3HA",  # Cluster 2 historical features
    "C3D-1HA", "C4D-1HA", "C4D-2HA", "C5D-1HA"  # Other cluster features
]

# Define target and features
y = data[target_columns_clusters]  # Binary target
X = data[selected_features]  # Selected features

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

print("Training and testing split done.")
print(f"Training set: {X_train.shape[0]} samples")
print(f"Testing set: {X_test.shape[0]} samples")

# Step 1: Initialize Pipeline
pipe_rf = Pipeline([
    ('classifier', MultiOutputClassifier(RandomForestClassifier(random_state=42)))
])

print("Pipeline initialized.")

# Step 2: Define Extended Parameter Grid
#  {'classifier__estimator__class_weight': 'balanced', 'classifier__estimator__max_depth': None, 'classifier__estimator__min_samples_leaf': 5, 'classifier__estimator__min_samples_split': 2, 'classifier__estimator__n_estimators': 500}

param_grid_rf = {
    'classifier__estimator__n_estimators': [100, 500, 600],  # Number of trees
    'classifier__estimator__max_depth': [10, 12, 15],  # Tree depth
    'classifier__estimator__min_samples_split': [2, 3, 5],  # Min samples to split
    'classifier__estimator__min_samples_leaf': [3, 5, 7],  # Min samples per leaf
    'classifier__estimator__class_weight': ['balanced']  # Handle class imbalance
}

# Step 3: Perform Grid Search with Cross-Validation
grid_rf = GridSearchCV(
    pipe_rf, param_grid_rf, cv=5, scoring='f1_weighted', return_train_score=True, verbose=1, n_jobs=-1
)
grid_rf.fit(X_train, y_train)

print("Grid search completed.")

# Step 4: Print the Best Parameters
print("Best parameters:\n", grid_rf.best_params_)

# Best training and validation scores
best_train_score = max(grid_rf.cv_results_['mean_train_score'])
print(f"Best cross-validation train score: {best_train_score:.2f}")
best_val_score = grid_rf.best_score_
print(f"Best cross-validation validation score: {best_val_score:.2f}")

# Step 5: Evaluate on Test Set
y_test_pred = grid_rf.best_estimator_.predict(X_test)

# Test set F1-score
test_f1_score = f1_score(y_test, y_test_pred, average='weighted')
print(f"Test-set F1-score: {test_f1_score:.2f}")

# Step 6: Detailed Evaluation
print("Classification report on the test set:\n")
print(classification_report(y_test, y_test_pred, zero_division=0))


Training and testing split done.
Training set: 338 samples
Testing set: 146 samples
Pipeline initialized.
Fitting 5 folds for each of 81 candidates, totalling 405 fits
Grid search completed.
Best parameters:
 {'classifier__estimator__class_weight': 'balanced', 'classifier__estimator__max_depth': 10, 'classifier__estimator__min_samples_leaf': 7, 'classifier__estimator__min_samples_split': 2, 'classifier__estimator__n_estimators': 100}
Best cross-validation train score: 0.96
Best cross-validation validation score: 0.65
Test-set F1-score: 0.68
Classification report on the test set:

              precision    recall  f1-score   support

           0       0.54      0.71      0.61        55
           1       0.45      0.60      0.52        48
           2       0.68      0.79      0.73        75
           3       0.64      0.64      0.64        76
           4       0.74      0.78      0.76        73
           5       0.72      0.74      0.73        93

   micro avg       0.64      0.72

In [3]:
# Import required libraries
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import classification_report, f1_score

# Load the dataset
data = pd.read_csv('xydata_2021winter.csv')

# Define target and features
target_columns_clusters = [f"Cluster{i}" for i in range(6)]  # Target variables
columns_to_drop = target_columns_clusters + ["Date", "Total_Accidents"]  # Non-predictive columns

# Retain only the most relevant features for X
selected_features = [
     "Time_Period", "Stn Press (kPa)", "Dew Point Temp (°C)", "Rel Hum (%)",
    "Visibility (km)", "Temp (°C)", "Wind Dir (10s deg)", "Wind Spd (km/h)",
    "C0D-1HA", "C0D-2HA", "C0D-4HA",  # Cluster 0 historical features
    "C1D-1HA", "C1D-2HA", "C1D-4HA",  # Cluster 1 historical features
    "C2D-1HA", "C2D-2HA", "C2D-3HA",  # Cluster 2 historical features
    "C3D-1HA", "C4D-1HA", "C4D-2HA", "C5D-1HA"  # Other cluster features
]

# Define target and features
y = data[target_columns_clusters]  # Binary target
X = data[selected_features]  # Selected features

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

print("Training and testing split done.")
print(f"Training set: {X_train.shape[0]} samples")
print(f"Testing set: {X_test.shape[0]} samples")

# Step 1: Initialize Pipeline
pipe_rf = Pipeline([
    ('classifier', MultiOutputClassifier(RandomForestClassifier(random_state=42)))
])

print("Pipeline initialized.")

# Step 2: Define Extended Parameter Grid
# {'classifier__estimator__class_weight': 'balanced', 'classifier__estimator__max_depth': 10, 'classifier__estimator__min_samples_leaf': 7, 'classifier__estimator__min_samples_split': 2, 'classifier__estimator__n_estimators': 100}

param_grid_rf = {
    'classifier__estimator__n_estimators': [80, 100, 120],
    'classifier__estimator__max_depth': [8, 10, 12],
    'classifier__estimator__min_samples_split': [2, 4, 6],
    'classifier__estimator__min_samples_leaf': [6, 7, 8],
    'classifier__estimator__class_weight': ['balanced']
}

# Step 3: Perform Grid Search with Cross-Validation
grid_rf = GridSearchCV(
    pipe_rf, param_grid_rf, cv=5, scoring='f1_weighted', return_train_score=True, verbose=1, n_jobs=-1
)
grid_rf.fit(X_train, y_train)

print("Grid search completed.")

# Step 4: Print the Best Parameters
print("Best parameters:\n", grid_rf.best_params_)

# Best training and validation scores
best_train_score = max(grid_rf.cv_results_['mean_train_score'])
print(f"Best cross-validation train score: {best_train_score:.2f}")
best_val_score = grid_rf.best_score_
print(f"Best cross-validation validation score: {best_val_score:.2f}")

# Step 5: Evaluate on Test Set
y_test_pred = grid_rf.best_estimator_.predict(X_test)

# Test set F1-score
test_f1_score = f1_score(y_test, y_test_pred, average='weighted')
print(f"Test-set F1-score: {test_f1_score:.2f}")

# Step 6: Detailed Evaluation
print("Classification report on the test set:\n")
print(classification_report(y_test, y_test_pred, zero_division=0))


Training and testing split done.
Training set: 338 samples
Testing set: 146 samples
Pipeline initialized.
Fitting 5 folds for each of 81 candidates, totalling 405 fits
Grid search completed.
Best parameters:
 {'classifier__estimator__class_weight': 'balanced', 'classifier__estimator__max_depth': 10, 'classifier__estimator__min_samples_leaf': 8, 'classifier__estimator__min_samples_split': 2, 'classifier__estimator__n_estimators': 120}
Best cross-validation train score: 0.88
Best cross-validation validation score: 0.66
Test-set F1-score: 0.68
Classification report on the test set:

              precision    recall  f1-score   support

           0       0.53      0.71      0.61        55
           1       0.46      0.60      0.52        48
           2       0.69      0.79      0.74        75
           3       0.63      0.67      0.65        76
           4       0.71      0.75      0.73        73
           5       0.72      0.72      0.72        93

   micro avg       0.64      0.71

In [4]:
# Import required libraries
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import classification_report, f1_score

# Load the dataset
data = pd.read_csv('xydata_2021winter.csv')

# Define target and features
target_columns_clusters = [f"Cluster{i}" for i in range(6)]  # Target variables
columns_to_drop = target_columns_clusters + ["Date", "Total_Accidents"]  # Non-predictive columns

# Retain only the most relevant features for X
selected_features = [
     "Time_Period", "Stn Press (kPa)", "Dew Point Temp (°C)", "Rel Hum (%)",
    "Visibility (km)", "Temp (°C)", "Wind Dir (10s deg)", "Wind Spd (km/h)",
    "C0D-1HA", "C0D-2HA", "C0D-4HA",  # Cluster 0 historical features
    "C1D-1HA", "C1D-2HA", "C1D-4HA",  # Cluster 1 historical features
    "C2D-1HA", "C2D-2HA", "C2D-3HA",  # Cluster 2 historical features
    "C3D-1HA", "C4D-1HA", "C4D-2HA", "C5D-1HA"  # Other cluster features
]

# Define target and features
y = data[target_columns_clusters]  # Binary target
X = data[selected_features]  # Selected features

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

print("Training and testing split done.")
print(f"Training set: {X_train.shape[0]} samples")
print(f"Testing set: {X_test.shape[0]} samples")

# Step 1: Initialize Pipeline
pipe_rf = Pipeline([
    ('classifier', MultiOutputClassifier(RandomForestClassifier(random_state=42)))
])

print("Pipeline initialized.")

# Step 2: Define Extended Parameter Grid
# {'classifier__estimator__class_weight': 'balanced', 'classifier__estimator__max_depth': 10, 'classifier__estimator__min_samples_leaf': 7, 'classifier__estimator__min_samples_split': 2, 'classifier__estimator__n_estimators': 100}

param_grid_rf = {
    'classifier__estimator__n_estimators': [110, 120, 130],
    'classifier__estimator__max_depth': [9, 10, 11],
    'classifier__estimator__min_samples_split': [2, 3, 4],
    'classifier__estimator__min_samples_leaf': [7, 8, 9],
    'classifier__estimator__class_weight': ['balanced']
}

# Step 3: Perform Grid Search with Cross-Validation
grid_rf = GridSearchCV(
    pipe_rf, param_grid_rf, cv=5, scoring='f1_weighted', return_train_score=True, verbose=1, n_jobs=-1
)
grid_rf.fit(X_train, y_train)

print("Grid search completed.")

# Step 4: Print the Best Parameters
print("Best parameters:\n", grid_rf.best_params_)

# Best training and validation scores
best_train_score = max(grid_rf.cv_results_['mean_train_score'])
print(f"Best cross-validation train score: {best_train_score:.2f}")
best_val_score = grid_rf.best_score_
print(f"Best cross-validation validation score: {best_val_score:.2f}")

# Step 5: Evaluate on Test Set
y_test_pred = grid_rf.best_estimator_.predict(X_test)

# Test set F1-score
test_f1_score = f1_score(y_test, y_test_pred, average='weighted')
print(f"Test-set F1-score: {test_f1_score:.2f}")

# Step 6: Detailed Evaluation
print("Classification report on the test set:\n")
print(classification_report(y_test, y_test_pred, zero_division=0))


Training and testing split done.
Training set: 338 samples
Testing set: 146 samples
Pipeline initialized.
Fitting 5 folds for each of 81 candidates, totalling 405 fits
Grid search completed.
Best parameters:
 {'classifier__estimator__class_weight': 'balanced', 'classifier__estimator__max_depth': 9, 'classifier__estimator__min_samples_leaf': 8, 'classifier__estimator__min_samples_split': 2, 'classifier__estimator__n_estimators': 120}
Best cross-validation train score: 0.85
Best cross-validation validation score: 0.66
Test-set F1-score: 0.68
Classification report on the test set:

              precision    recall  f1-score   support

           0       0.53      0.73      0.62        55
           1       0.48      0.65      0.55        48
           2       0.69      0.79      0.73        75
           3       0.64      0.67      0.65        76
           4       0.71      0.75      0.73        73
           5       0.72      0.72      0.72        93

   micro avg       0.64      0.72 

In [5]:
# Import required libraries
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import classification_report, f1_score

# Load the dataset
data = pd.read_csv('xydata_2021winter.csv')

# Define target and features
target_columns_clusters = [f"Cluster{i}" for i in range(6)]  # Target variables
columns_to_drop = target_columns_clusters + ["Date", "Total_Accidents"]  # Non-predictive columns

# Retain only the most relevant features for X
selected_features = [
     "Time_Period", "Stn Press (kPa)", "Dew Point Temp (°C)", "Rel Hum (%)",
    "Visibility (km)", "Temp (°C)", "Wind Dir (10s deg)", "Wind Spd (km/h)",
    "C0D-1HA", "C0D-2HA", "C0D-4HA",  # Cluster 0 historical features
    "C1D-1HA", "C1D-2HA", "C1D-4HA",  # Cluster 1 historical features
    "C2D-1HA", "C2D-2HA", "C2D-3HA",  # Cluster 2 historical features
    "C3D-1HA", "C4D-1HA", "C4D-2HA", "C5D-1HA"  # Other cluster features
]

# Define target and features
y = data[target_columns_clusters]  # Binary target
X = data[selected_features]  # Selected features

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

print("Training and testing split done.")
print(f"Training set: {X_train.shape[0]} samples")
print(f"Testing set: {X_test.shape[0]} samples")

# Step 1: Initialize Pipeline
pipe_rf = Pipeline([
    ('classifier', MultiOutputClassifier(RandomForestClassifier(random_state=42)))
])

print("Pipeline initialized.")

# Step 2: Define Extended Parameter Grid
# {'classifier__estimator__class_weight': 'balanced', 'classifier__estimator__max_depth': 10, 'classifier__estimator__min_samples_leaf': 7, 'classifier__estimator__min_samples_split': 2, 'classifier__estimator__n_estimators': 100}

param_grid_rf = {
'classifier__estimator__n_estimators': [115, 120, 135],
    'classifier__estimator__max_depth': [8, 9, 10],
    'classifier__estimator__min_samples_split': [2, 3],
    'classifier__estimator__min_samples_leaf': [7, 8, 9],
    'classifier__estimator__class_weight': ['balanced']
}

# Step 3: Perform Grid Search with Cross-Validation
grid_rf = GridSearchCV(
    pipe_rf, param_grid_rf, cv=5, scoring='f1_weighted', return_train_score=True, verbose=1, n_jobs=-1
)
grid_rf.fit(X_train, y_train)

print("Grid search completed.")

# Step 4: Print the Best Parameters
print("Best parameters:\n", grid_rf.best_params_)

# Best training and validation scores
best_train_score = max(grid_rf.cv_results_['mean_train_score'])
print(f"Best cross-validation train score: {best_train_score:.2f}")
best_val_score = grid_rf.best_score_
print(f"Best cross-validation validation score: {best_val_score:.2f}")

# Step 5: Evaluate on Test Set
y_test_pred = grid_rf.best_estimator_.predict(X_test)

# Test set F1-score
test_f1_score = f1_score(y_test, y_test_pred, average='weighted')
print(f"Test-set F1-score: {test_f1_score:.2f}")

# Step 6: Detailed Evaluation
print("Classification report on the test set:\n")
print(classification_report(y_test, y_test_pred, zero_division=0))


Training and testing split done.
Training set: 338 samples
Testing set: 146 samples
Pipeline initialized.
Fitting 5 folds for each of 54 candidates, totalling 270 fits
Grid search completed.
Best parameters:
 {'classifier__estimator__class_weight': 'balanced', 'classifier__estimator__max_depth': 9, 'classifier__estimator__min_samples_leaf': 8, 'classifier__estimator__min_samples_split': 2, 'classifier__estimator__n_estimators': 135}
Best cross-validation train score: 0.85
Best cross-validation validation score: 0.66
Test-set F1-score: 0.69
Classification report on the test set:

              precision    recall  f1-score   support

           0       0.53      0.71      0.61        55
           1       0.48      0.67      0.56        48
           2       0.69      0.81      0.75        75
           3       0.64      0.68      0.66        76
           4       0.72      0.77      0.74        73
           5       0.72      0.72      0.72        93

   micro avg       0.64      0.73 

In [6]:
# Import required libraries
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import classification_report, f1_score

# Load the dataset
data = pd.read_csv('xydata_2022winter.csv')

# Define target and features
target_columns_clusters = [f"Cluster{i}" for i in range(6)]  # Target variables
columns_to_drop = target_columns_clusters + ["Date", "Total_Accidents"]  # Non-predictive columns

# Retain only the most relevant features for X
selected_features = [
     "Time_Period", "Stn Press (kPa)", "Dew Point Temp (°C)", "Rel Hum (%)",
    "Visibility (km)", "Temp (°C)", "Wind Dir (10s deg)", "Wind Spd (km/h)",
    "C0D-1HA", "C0D-2HA", "C0D-4HA",  # Cluster 0 historical features
    "C1D-1HA", "C1D-2HA", "C1D-4HA",  # Cluster 1 historical features
    "C2D-1HA", "C2D-2HA", "C2D-3HA",  # Cluster 2 historical features
    "C3D-1HA", "C4D-1HA", "C4D-2HA", "C5D-1HA"  # Other cluster features
]

# Define target and features
y = data[target_columns_clusters]  # Binary target
X = data[selected_features]  # Selected features

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

print("Training and testing split done.")
print(f"Training set: {X_train.shape[0]} samples")
print(f"Testing set: {X_test.shape[0]} samples")

# Step 1: Initialize Pipeline
pipe_rf = Pipeline([
    ('classifier', MultiOutputClassifier(RandomForestClassifier(random_state=42)))
])

print("Pipeline initialized.")

# Step 2: Define Extended Parameter Grid
# {'classifier__estimator__class_weight': 'balanced', 'classifier__estimator__max_depth': 10, 'classifier__estimator__min_samples_leaf': 7, 'classifier__estimator__min_samples_split': 2, 'classifier__estimator__n_estimators': 100}

param_grid_rf = {
'classifier__estimator__n_estimators': [115, 120, 135],
    'classifier__estimator__max_depth': [8, 9, 10],
    'classifier__estimator__min_samples_split': [2, 3],
    'classifier__estimator__min_samples_leaf': [7, 8, 9],
    'classifier__estimator__class_weight': ['balanced']
}

# Step 3: Perform Grid Search with Cross-Validation
grid_rf = GridSearchCV(
    pipe_rf, param_grid_rf, cv=5, scoring='f1_weighted', return_train_score=True, verbose=1, n_jobs=-1
)
grid_rf.fit(X_train, y_train)

print("Grid search completed.")

# Step 4: Print the Best Parameters
print("Best parameters:\n", grid_rf.best_params_)

# Best training and validation scores
best_train_score = max(grid_rf.cv_results_['mean_train_score'])
print(f"Best cross-validation train score: {best_train_score:.2f}")
best_val_score = grid_rf.best_score_
print(f"Best cross-validation validation score: {best_val_score:.2f}")

# Step 5: Evaluate on Test Set
y_test_pred = grid_rf.best_estimator_.predict(X_test)

# Test set F1-score
test_f1_score = f1_score(y_test, y_test_pred, average='weighted')
print(f"Test-set F1-score: {test_f1_score:.2f}")

# Step 6: Detailed Evaluation
print("Classification report on the test set:\n")
print(classification_report(y_test, y_test_pred, zero_division=0))


Training and testing split done.
Training set: 338 samples
Testing set: 146 samples
Pipeline initialized.
Fitting 5 folds for each of 54 candidates, totalling 270 fits
Grid search completed.
Best parameters:
 {'classifier__estimator__class_weight': 'balanced', 'classifier__estimator__max_depth': 10, 'classifier__estimator__min_samples_leaf': 7, 'classifier__estimator__min_samples_split': 2, 'classifier__estimator__n_estimators': 135}
Best cross-validation train score: 0.84
Best cross-validation validation score: 0.67
Test-set F1-score: 0.69
Classification report on the test set:

              precision    recall  f1-score   support

           0       0.60      0.61      0.60        56
           1       0.44      0.61      0.51        41
           2       0.61      0.67      0.64        72
           3       0.68      0.89      0.77        75
           4       0.74      0.72      0.73        76
           5       0.70      0.85      0.77        81

   micro avg       0.64      0.74

In [32]:
import pandas as pd
import joblib
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report, f1_score

# Load the dataset
data = pd.read_csv('xydata_2021winter.csv')

# Define target and features
target_columns_clusters = [f"Cluster{i}" for i in range(6)]  # Target variables
columns_to_drop = target_columns_clusters + ["Date", "Total_Accidents"]  # Non-predictive columns

# Retain only the most relevant features for X
selected_features = [
     "Time_Period", "Stn Press (kPa)", "Dew Point Temp (°C)", "Rel Hum (%)",
    "Visibility (km)", "Temp (°C)", "Wind Dir (10s deg)", "Wind Spd (km/h)",
    "C0D-1HA", "C0D-2HA", "C0D-4HA",  # Cluster 0 historical features
    "C1D-1HA", "C1D-2HA", "C1D-4HA",  # Cluster 1 historical features
    "C2D-1HA", "C2D-2HA", "C2D-3HA",  # Cluster 2 historical features
    "C3D-1HA", "C4D-1HA", "C4D-2HA", "C5D-1HA"  # Other cluster features
]

# Define target and features
y = data[target_columns_clusters]  # Binary targets
X = data[selected_features]  # Selected features

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Setup the RandomForestClassifier with specific parameters
rf_classifier = RandomForestClassifier(
    n_estimators=135,
    max_depth=9,
    min_samples_split=2,
    min_samples_leaf=8,
    class_weight='balanced',
    random_state=42  # for reproducibility
)

# Train the model
rf_classifier.fit(X_train, y_train)

# Evaluate the model using cross-validation on the training set
cv_scores = cross_val_score(rf_classifier, X_train, y_train, cv=5, scoring='f1_weighted')
print(f"Best cross-validation train score: {cv_scores.mean():.2f}")

# Predict on the test set
y_pred = rf_classifier.predict(X_test)

# Calculate F1-score on the test set, handling division by zero explicitly
test_f1 = f1_score(y_test, y_pred, average='weighted', zero_division=1)
print(f"Test-set F1-score: {test_f1:.2f}")

# Generate a classification report, handling division by zero explicitly
print("Classification report on the test set:")
print(classification_report(y_test, y_pred, zero_division=1))

# Save the model to disk
joblib.dump(rf_classifier, 'final_random_forest_model.pkl')
print("Model saved successfully!")

Best cross-validation train score: 0.69
Test-set F1-score: 0.72
Classification report on the test set:
              precision    recall  f1-score   support

           0       0.54      0.78      0.64        55
           1       0.44      0.67      0.53        48
           2       0.70      0.91      0.79        75
           3       0.62      0.79      0.69        76
           4       0.69      0.86      0.77        73
           5       0.72      0.85      0.78        93

   micro avg       0.63      0.82      0.71       420
   macro avg       0.62      0.81      0.70       420
weighted avg       0.64      0.82      0.72       420
 samples avg       0.70      0.77      0.63       420

Model saved successfully!


In [34]:
import pandas as pd
import joblib
from sklearn.metrics import accuracy_score, classification_report

# Load the saved model
print("Loading the trained model...")
model = joblib.load('final_random_forest_model.pkl')

# Load the new dataset formatted like the training data
print("Loading the dataset...")
data = pd.read_csv('xydata_2018winter.csv')

# Define the feature columns (ensure they match those used during training)
features = [
    "Time_Period", "Stn Press (kPa)", "Dew Point Temp (°C)", "Rel Hum (%)",
    "Visibility (km)", "Temp (°C)", "Wind Dir (10s deg)", "Wind Spd (km/h)",
    "C0D-1HA", "C0D-2HA", "C0D-4HA",  # Cluster 0 historical features
    "C1D-1HA", "C1D-2HA", "C1D-4HA",  # Cluster 1 historical features
    "C2D-1HA", "C2D-2HA", "C2D-3HA",  # Cluster 2 historical features
    "C3D-1HA", "C4D-1HA", "C4D-2HA", "C5D-1HA"  # Other cluster features
]

# Define the target cluster columns
target_columns = [f"Cluster{i}" for i in range(6)]

# Ensure all features used during training are present in the dataset
for feature in features:
    if feature not in data.columns:
        data[feature] = 0  # Add missing features with a default value of 0

# Extract the features (X) and target (y) from the dataset
X = data[features]
y_true = data[target_columns]

# Generate predictions using the loaded model
print("Making predictions...")
y_pred = model.predict(X)

# Convert predictions to a DataFrame for better readability
y_pred_df = pd.DataFrame(y_pred, columns=target_columns)

# Calculate the accuracy score
accuracy = accuracy_score(y_true, y_pred_df)
print(f"Test set accuracy score: {accuracy:.2f}")

# Display a detailed classification report with zero_division=0
print("\nClassification Report:")
print(classification_report(y_true, y_pred_df, target_names=target_columns, zero_division=0))

# Display the first 5 rows of predictions alongside true labels for comparison
print("\nFirst 5 Predictions vs. True Labels:")
comparison = pd.concat([y_pred_df.head(5), y_true.head(5)], axis=1, keys=["Predicted", "True"])
print(comparison)


Loading the trained model...
Loading the dataset...
Making predictions...
Test set accuracy score: 0.19

Classification Report:
              precision    recall  f1-score   support

    Cluster0       0.41      0.69      0.52       144
    Cluster1       0.38      0.59      0.46       134
    Cluster2       0.55      0.83      0.66       206
    Cluster3       0.63      0.80      0.70       246
    Cluster4       0.63      0.86      0.73       232
    Cluster5       0.66      0.91      0.76       258

   micro avg       0.56      0.80      0.66      1220
   macro avg       0.54      0.78      0.64      1220
weighted avg       0.57      0.80      0.66      1220
 samples avg       0.42      0.60      0.46      1220


First 5 Predictions vs. True Labels:
  Predicted                                                  True           \
   Cluster0 Cluster1 Cluster2 Cluster3 Cluster4 Cluster5 Cluster0 Cluster1   
0         0        0        1        0        0        1        0        0   
1  